From 370d3ffa889299f784c0ea315ce3f925627ec80f Mon Sep 17 00:00:00 2001 From: David Williams-Young Date: Mon, 11 Mar 2024 13:40:07 -0700 Subject: [PATCH 1/7] Fix (d|f) and (f|f) sn-K contractions on the CPU --- .../host/obara_saika/src/integral_3.cxx | 3049 ++- .../host/obara_saika/src/integral_3_2.cxx | 7040 +++++- .../host/obara_saika/src/integral_3_3.cxx | 17945 +++++++++++++++- 3 files changed, 26761 insertions(+), 1273 deletions(-) diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx index 961c3bc1..ac71a8ed 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx @@ -1219,124 +1219,913 @@ void integral_3(size_t npts, double *Xik = (Xi + p_outer + p_inner); double *Gik = (Gi + p_outer + p_inner); - for(int c0 = 0; c0 <= 3; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 3 - c0; - int p = c1; - - int idxB = (((3 - m) * (3 - m + 1)) >> 1) + p; - - int mv, pv; - - SIMD_TYPE tx, wg, xik, gik; - mv = 3 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); - mv = 2 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); - mv = 2 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) 
>> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); - mv = 1 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); - mv = 1 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); - mv = 1 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); - mv = 0 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); - mv = 0 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); - mv = 0 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); - mv = 0 + m; pv = 3 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); - } - } + SIMD_TYPE tx, wg, xik, gik; + tx = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 55 
* NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + 
p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = 
SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * 
ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 
* NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + 
p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = 
SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); } } // cleanup code for(; p_outer < npts; p_outer += NPTS_LOCAL) { - size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); + size_t npts_inner = MIN((size_t) NPTS_LOCAL, npts - p_outer); double *_point_outer = (_points + p_outer); double xA = rA.x; @@ -3680,236 +4469,1814 @@ void integral_3(size_t npts, double *Xik = (Xi + p_outer + p_inner); double *Gik = (Gi + p_outer + p_inner); - for(int c0 = 0; c0 <= 3; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 3 - c0; - int p = c1; - - int idxB = (((3 - m) * (3 - m + 1)) >> 1) + p; - - int mv, pv; - - SIMD_TYPE tx, wg, xik, gik; - mv = 3 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); 
- - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); - mv = 2 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); - mv = 2 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); - mv = 1 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); - mv = 1 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); - mv = 1 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * 
ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); - mv = 0 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); - mv = 0 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); - mv = 0 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); - mv = 0 + m; pv = 3 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); - } - } - } + SIMD_TYPE tx, wg, xik, gik; + tx = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(; p_inner < npts_inner; p_inner += SCALAR_LENGTH) { - double *Xik = (Xi + p_outer + p_inner); - double *Gik 
= (Gi + p_outer + p_inner); + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - for(int c0 = 0; c0 <= 3; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 3 - c0; - int p = c1; - - int idxB = (((3 - m) * (3 - m + 1)) >> 1) + p; - - int mv, pv; - - SCALAR_TYPE tx, wg, xik, gik; - mv = 3 + m; pv = 0 + p; - tx = SCALAR_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 0 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 0 * ldG), gik); - mv = 2 + m; pv = 0 + p; - tx = SCALAR_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 1 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 1 * ldG), gik); - mv = 2 + m; pv = 1 + p; - tx = SCALAR_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 2 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 2 * ldG), gik); - mv = 1 + m; pv = 0 + p; - tx = SCALAR_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 3 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 3 * ldG), gik); - mv = 1 + m; pv = 1 + p; - tx = SCALAR_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 4 
* ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 4 * ldG), gik); - mv = 1 + m; pv = 2 + p; - tx = SCALAR_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 5 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 5 * ldG), gik); - mv = 0 + m; pv = 0 + p; - tx = SCALAR_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 6 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 6 * ldG), gik); - mv = 0 + m; pv = 1 + p; - tx = SCALAR_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 7 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 7 * ldG), gik); - mv = 0 + m; pv = 2 + p; - tx = SCALAR_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 8 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 8 * ldG), gik); - mv = 0 + m; pv = 3 + p; - tx = SCALAR_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 9 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 9 * ldG), gik); - } - } + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), 
gik); + tx = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * 
ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + 
SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 
* NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + 
p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = 
SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * 
ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + 
SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 71 
* NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + 
p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + } + + for(; p_inner < npts_inner; p_inner += SCALAR_LENGTH) { + double *Xik = (Xi + p_outer + p_inner); + double *Gik = (Gi + p_outer + p_inner); + + SCALAR_TYPE tx, wg, xik, gik; + tx = 
SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx 
= SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + 
tx = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); 
+ tx = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), 
gik); + tx = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * 
ldG), gik); + tx = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 
* ldG), gik); + tx = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 
8 * ldG), gik); + tx = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik 
+ 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + 
SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); 
+ SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, 
gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, 
xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = 
SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik 
= SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); } } } diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx index f43292cd..b415da7b 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx @@ -756,167 +756,2203 @@ void integral_3_2(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 2; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 2 - c0; - int n = c0 - c1; - int p = c1; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; - int idxB = (((2 - m) * (2 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 5 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 60) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); - - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = 
SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + 
(((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = 
SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + 
p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); 
+ tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = 
SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); 
+ tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * 
ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); 
comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, 
tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp 
+ 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, 
tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + 
SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); 
+ t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty 
= SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = 
SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk 
+ 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, 
tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + 
const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * 
NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); 
+ tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 
0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + 
p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + 
const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); } } for(; p_outer < npts; p_outer += NPTS_LOCAL) { - size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); + size_t npts_inner = MIN((size_t) NPTS_LOCAL, npts - p_outer); double *_point_outer = (_points + p_outer); double X_AB = rA.x - rB.x; @@ -2323,162 +4359,2198 @@ void integral_3_2(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 2; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 2 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((2 - m) * (2 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 5 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 60) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = 
SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = 
SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 
* ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - 
} + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, 
tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); 
+ t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = 
SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, 
const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = 
SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * 
comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * 
ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + 
t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, 
t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 
* ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; 
comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = 
SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, 
const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = 
SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = 
SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, 
const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + 
t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = 
SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, 
tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 
* ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL 
+ p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); } for(; p_inner < npts_inner; p_inner += SCALAR_LENGTH) { @@ -2489,162 +6561,2198 @@ void integral_3_2(size_t npts, SCALAR_TYPE const_value_v = SCALAR_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 2; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 2 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((2 - m) * (2 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SCALAR_TYPE const_value_w; + SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 5 - i - 
j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 60) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SCALAR_TYPE tx, ty, tz, tw; - SCALAR_TYPE const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); - - mv = 3 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 0 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t0 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SCALAR_MUL(t0, const_value_w); - tz = SCALAR_FMA(ty, t0, tz); - tw = SCALAR_FMA(tx, t0, tw); - SCALAR_STORE((Gik + 0 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 1 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 1 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t1 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SCALAR_MUL(t1, const_value_w); - tz = SCALAR_FMA(ty, t1, tz); - tw = SCALAR_FMA(tx, t1, tw); - SCALAR_STORE((Gik + 1 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 2 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 2 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t2 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SCALAR_MUL(t2, const_value_w); - tz = SCALAR_FMA(ty, t2, tz); - tw = SCALAR_FMA(tx, t2, tw); - SCALAR_STORE((Gik + 2 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 3 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 3 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t3 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + 
p_inner)); - t3 = SCALAR_MUL(t3, const_value_w); - tz = SCALAR_FMA(ty, t3, tz); - tw = SCALAR_FMA(tx, t3, tw); - SCALAR_STORE((Gik + 3 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 4 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 4 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t4 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SCALAR_MUL(t4, const_value_w); - tz = SCALAR_FMA(ty, t4, tz); - tw = SCALAR_FMA(tx, t4, tw); - SCALAR_STORE((Gik + 4 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 5 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 5 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t5 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SCALAR_MUL(t5, const_value_w); - tz = SCALAR_FMA(ty, t5, tz); - tw = SCALAR_FMA(tx, t5, tw); - SCALAR_STORE((Gik + 5 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 6 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 6 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t6 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SCALAR_MUL(t6, const_value_w); - tz = SCALAR_FMA(ty, t6, tz); - tw = SCALAR_FMA(tx, t6, tw); - SCALAR_STORE((Gik + 6 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 7 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 7 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t7 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SCALAR_MUL(t7, const_value_w); - tz = SCALAR_FMA(ty, t7, 
tz); - tw = SCALAR_FMA(tx, t7, tw); - SCALAR_STORE((Gik + 7 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 8 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 8 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t8 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SCALAR_MUL(t8, const_value_w); - tz = SCALAR_FMA(ty, t8, tz); - tw = SCALAR_FMA(tx, t8, tw); - SCALAR_STORE((Gik + 8 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SCALAR_LOAD((Xik + 9 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 9 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t9 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SCALAR_MUL(t9, const_value_w); - tz = SCALAR_FMA(ty, t9, tz); - tw = SCALAR_FMA(tx, t9, tw); - SCALAR_STORE((Gik + 9 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 25 * 
NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = 
SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = 
SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL 
+ p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, 
SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz 
= SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = 
SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * 
comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 
1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); 
+ Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + 
tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + 
Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = 
SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 
9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL 
+ p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, 
tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = 
SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, 
t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * 
ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 
24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = 
SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 
* ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = 
SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 
3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, 
const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), 
tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + 
p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, 
tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, 
const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = 
SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + 
p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 
= SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk 
+ 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik 
+ 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); } } } diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx index f78c490d..62521b11 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx @@ -1228,167 +1228,5838 @@ void integral_3_3(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 3; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 3 - c0; - int n = c0 - c1; - int p = c1; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; - int idxB = (((3 - m) * (3 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 6 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 60) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = 
SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); - - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, 
const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = 
SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; 
+ const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 50 * 
NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, 
tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = 
SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); 
+ t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t8 = 
SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, 
const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, 
const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk 
+ 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz 
= SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = 
SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + 
t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, 
t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk 
+ 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, 
tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + 
SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp 
+ 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, 
const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, 
tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, 
t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; 
+ Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, 
tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = 
SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + 
t6 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + 
p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + 
t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, 
t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk 
+ 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = 
SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = 
SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, 
tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 
* ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 30 * 
NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); 
+ tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = 
SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = 
SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, 
tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k 
* X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = 
SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik 
+ 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + 
p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = 
SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = 
SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); 
+ t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = 
SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); 
+ tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); 
+ t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + 
p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = 
SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp 
+ 37 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 
* ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx 
= SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + 
SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, 
const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + 
t3 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); 
+ tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); } } for(; p_outer < npts; p_outer += NPTS_LOCAL) { - size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); + size_t npts_inner = MIN((size_t) NPTS_LOCAL, npts - 
p_outer); double *_point_outer = (_points + p_outer); double X_AB = rA.x - rB.x; @@ -3739,162 +9410,5833 @@ void integral_3_3(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 3; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 3 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((3 - m) * (3 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 6 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 60) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, 
const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = 
SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - 
tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, 
tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = 
SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 
SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, 
t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = 
SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = 
SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = 
SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); 
+ t3 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty 
= SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, 
tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = 
SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t7 = 
SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, 
tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 
* ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, 
tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + 
SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); 
+ t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty 
= SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = 
SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, 
tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = 
SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik 
+ 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + 
p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = 
SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = 
SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + 
SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); 
+ t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); 
+ tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = 
SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, 
tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + 
const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 33 * 
NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); 
+ tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = 
SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = 
SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, 
tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = 
SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + 
t6 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = 
SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 
3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + 
p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = 
SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = 
SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = 
SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); 
+ t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); 
+ tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w 
= SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + 
p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = 
SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); 
+ tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik 
+ 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + 
p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = 
SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = 
SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = 
SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); } for(; p_inner < npts_inner; p_inner += SCALAR_LENGTH) { @@ -3905,162 +15247,5833 @@ void integral_3_3(size_t npts, SCALAR_TYPE const_value_v = SCALAR_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 3; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 3 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((3 - m) * (3 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SCALAR_TYPE const_value_w; + SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 6 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 60) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - 
SCALAR_TYPE tx, ty, tz, tw; - SCALAR_TYPE const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); - - mv = 3 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 0 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t0 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SCALAR_MUL(t0, const_value_w); - tz = SCALAR_FMA(ty, t0, tz); - tw = SCALAR_FMA(tx, t0, tw); - SCALAR_STORE((Gik + 0 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 1 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 1 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t1 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SCALAR_MUL(t1, const_value_w); - tz = SCALAR_FMA(ty, t1, tz); - tw = SCALAR_FMA(tx, t1, tw); - SCALAR_STORE((Gik + 1 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 2 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 2 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t2 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SCALAR_MUL(t2, const_value_w); - tz = SCALAR_FMA(ty, t2, tz); - tw = SCALAR_FMA(tx, t2, tw); - SCALAR_STORE((Gik + 2 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 3 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 3 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t3 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SCALAR_MUL(t3, const_value_w); - tz = SCALAR_FMA(ty, t3, tz); - tw = SCALAR_FMA(tx, t3, tw); - SCALAR_STORE((Gik + 3 * ldG), 
tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 4 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 4 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t4 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SCALAR_MUL(t4, const_value_w); - tz = SCALAR_FMA(ty, t4, tz); - tw = SCALAR_FMA(tx, t4, tw); - SCALAR_STORE((Gik + 4 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 5 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 5 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t5 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SCALAR_MUL(t5, const_value_w); - tz = SCALAR_FMA(ty, t5, tz); - tw = SCALAR_FMA(tx, t5, tw); - SCALAR_STORE((Gik + 5 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 6 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 6 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t6 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SCALAR_MUL(t6, const_value_w); - tz = SCALAR_FMA(ty, t6, tz); - tw = SCALAR_FMA(tx, t6, tw); - SCALAR_STORE((Gik + 6 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 7 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 7 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t7 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SCALAR_MUL(t7, const_value_w); - tz = SCALAR_FMA(ty, t7, tz); - tw = SCALAR_FMA(tx, t7, tw); - SCALAR_STORE((Gik + 7 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - 
k; - tx = SCALAR_LOAD((Xik + 8 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 8 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t8 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SCALAR_MUL(t8, const_value_w); - tz = SCALAR_FMA(ty, t8, tz); - tw = SCALAR_FMA(tx, t8, tw); - SCALAR_STORE((Gik + 8 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SCALAR_LOAD((Xik + 9 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 9 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t9 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SCALAR_MUL(t9, const_value_w); - tz = SCALAR_FMA(ty, t9, tz); - tw = SCALAR_FMA(tx, t9, tw); - SCALAR_STORE((Gik + 9 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * 
ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk 
+ 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); 
+ tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + 
SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * 
ldG)); + t0 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = 
SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = 
SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 
0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk 
+ 1 * ldG)); + t5 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, 
SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * 
comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 
1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); 
+ Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + 
SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); 
comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = 
SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + 
SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * 
ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = 
SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t9 = 
SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + 
tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 34 * 
NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk 
+ 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = 
SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik 
+ 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = 
SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + 
SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp 
+ 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = 
SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * 
ldG)); + t3 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = 
SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = 
SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL 
+ p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik 
+ 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, 
tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * 
ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); 
+ tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + 
SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * 
NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, 
t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = 
SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, 
t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, 
const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 
= SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, 
t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 
7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, 
tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); 
+ tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = 
SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + 
SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = 
SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, 
tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 
5 * ldG)); + t1 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz 
= SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); 
+ tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = 
SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + 
tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 16 * 
NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk 
+ 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = 
SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik 
+ 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = 
SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = 
SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + 
SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 
28 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = 
SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = 
SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = 
SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 
0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk 
+ 7 * ldG)); + t5 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); 
+ tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * 
ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w 
= SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * 
Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; 
comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + 
SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); 
comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = 
SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + X_ABp = 1.0; 
comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = 
SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + 
SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + 
t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + 
SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = 
SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, 
t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik 
+ 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + 
t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * 
ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 
24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = 
SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + 
SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + 
SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); 
+ ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = 
SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); } } } From c2f5b6eb6bd2a17e5413bbebdecf72d8166da2b6 Mon Sep 17 00:00:00 2001 From: David Williams-Young Date: Mon, 11 Mar 2024 13:54:47 -0700 Subject: [PATCH 2/7] MIN -> min in f sn-K CPU kernels --- .../local_work_driver/host/obara_saika/src/integral_3.cxx | 2 +- .../local_work_driver/host/obara_saika/src/integral_3_2.cxx | 2 +- .../local_work_driver/host/obara_saika/src/integral_3_3.cxx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx index ac71a8ed..cb1c330a 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx @@ -2125,7 +2125,7 @@ void integral_3(size_t npts, // cleanup code for(; p_outer < npts; p_outer += NPTS_LOCAL) { - size_t npts_inner = MIN((size_t) NPTS_LOCAL, npts - p_outer); + size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); double *_point_outer = (_points + p_outer); double xA = rA.x; diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx index b415da7b..f34e192a 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx @@ -2952,7 +2952,7 @@ void integral_3_2(size_t npts, } for(; p_outer < npts; p_outer += NPTS_LOCAL) { - size_t npts_inner = MIN((size_t) NPTS_LOCAL, npts - p_outer); + 
size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); double *_point_outer = (_points + p_outer); double X_AB = rA.x - rB.x; diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx index 62521b11..09222e56 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx @@ -7059,7 +7059,7 @@ void integral_3_3(size_t npts, } for(; p_outer < npts; p_outer += NPTS_LOCAL) { - size_t npts_inner = MIN((size_t) NPTS_LOCAL, npts - p_outer); + size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); double *_point_outer = (_points + p_outer); double X_AB = rA.x - rB.x; From 847c23db71fc806da2c3833ce26944e226c796b0 Mon Sep 17 00:00:00 2001 From: David Williams-Young Date: Mon, 11 Mar 2024 13:55:42 -0700 Subject: [PATCH 3/7] Fix (p|g) ... (g|g) sn-K CPU kernels --- .../host/obara_saika/src/integral_4.cxx | 6568 +- .../host/obara_saika/src/integral_4_1.cxx | 3422 +- .../host/obara_saika/src/integral_4_2.cxx | 10356 ++- .../host/obara_saika/src/integral_4_3.cxx | 26510 ++++++- .../host/obara_saika/src/integral_4_4.cxx | 58790 +++++++++++++++- 5 files changed, 102633 insertions(+), 3013 deletions(-) diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx index 0acd4495..7b0ab4ed 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx @@ -17,7 +17,7 @@ namespace XCPU { void integral_4(size_t npts, double *_points, point rA, - point /*rB*/, + point rB, int nprim_pairs, prim_pair *prim_pairs, double *Xi, @@ -2810,174 +2810,2038 @@ void integral_4(size_t npts, double *Xik = (Xi + p_outer + p_inner); double *Gik = (Gi + p_outer + p_inner); - for(int 
c0 = 0; c0 <= 4; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 4 - c0; - int p = c1; - - int idxB = (((4 - m) * (4 - m + 1)) >> 1) + p; - - int mv, pv; - - SIMD_TYPE tx, wg, xik, gik; - mv = 4 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); - mv = 3 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); - mv = 3 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); - mv = 2 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); - mv = 2 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * 
ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); - mv = 2 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); - mv = 1 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); - mv = 1 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); - mv = 1 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); - mv = 1 + m; pv = 3 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik 
+ 9 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); - mv = 0 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); - mv = 0 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); - mv = 0 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); - mv = 0 + m; pv = 3 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); - mv = 0 + m; pv = 4 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); - - tx = 
SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); - } - } + SIMD_TYPE tx, wg, xik, gik; + tx = SIMD_ALIGNED_LOAD((temp + 100 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = 
SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); 
+ tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg 
= SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = 
SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), 
gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); 
+ wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = 
SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), 
gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + 
wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = 
SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), 
gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg 
= SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = 
SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), 
gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg 
= SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 136 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + 
gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik 
+ 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL 
+ p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer 
+ p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + 
gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = 
SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + 
wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 144 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); } } // cleanup code for(; p_outer < npts; p_outer += NPTS_LOCAL) { - size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); + size_t npts_inner = MIN((size_t) NPTS_LOCAL, npts - p_outer); double *_point_outer = (_points + p_outer); double xA = rA.x; @@ -8503,336 +10367,4064 @@ void integral_4(size_t npts, double *Xik = (Xi + p_outer + p_inner); double *Gik = (Gi + p_outer + p_inner); - for(int c0 = 0; c0 <= 4; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 4 - c0; - int p = c1; - - int idxB = (((4 - m) * (4 - m + 1)) >> 1) + p; - - int mv, pv; - - SIMD_TYPE tx, wg, xik, gik; - mv = 4 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); - mv = 3 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = 
SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); - mv = 3 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); - mv = 2 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); - mv = 2 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); - mv = 2 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); - mv = 1 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * 
ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); - mv = 1 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); - mv = 1 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); - mv = 1 + m; pv = 3 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); - mv = 0 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); - mv = 0 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = 
SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); - mv = 0 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); - mv = 0 + m; pv = 3 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); - mv = 0 + m; pv = 4 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); - } - } + SIMD_TYPE tx, wg, xik, gik; + tx = SIMD_ALIGNED_LOAD((temp + 100 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * 
ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + 
p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + 
p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik 
= SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), 
gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg 
= SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 
6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, 
gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 
7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, 
gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 
8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, 
gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 
9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, 
xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 136 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg 
= SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = 
SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + 
gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 144 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik 
+ 14 * ldG), gik); } for(; p_inner < npts_inner; p_inner += SCALAR_LENGTH) { double *Xik = (Xi + p_outer + p_inner); double *Gik = (Gi + p_outer + p_inner); - for(int c0 = 0; c0 <= 4; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 4 - c0; - int p = c1; - - int idxB = (((4 - m) * (4 - m + 1)) >> 1) + p; - - int mv, pv; - - SCALAR_TYPE tx, wg, xik, gik; - mv = 4 + m; pv = 0 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 0 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 0 * ldG), gik); - mv = 3 + m; pv = 0 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 1 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 1 * ldG), gik); - mv = 3 + m; pv = 1 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 2 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 2 * ldG), gik); - mv = 2 + m; pv = 0 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 3 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 3 * ldG), gik); - mv = 2 + m; pv = 1 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + 
idxB * ldX)); - gik = SCALAR_LOAD((Gik + 4 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 4 * ldG), gik); - mv = 2 + m; pv = 2 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 5 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 5 * ldG), gik); - mv = 1 + m; pv = 0 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 6 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 6 * ldG), gik); - mv = 1 + m; pv = 1 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 7 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 7 * ldG), gik); - mv = 1 + m; pv = 2 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 8 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 8 * ldG), gik); - mv = 1 + m; pv = 3 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 9 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 9 * ldG), gik); - mv = 0 + m; pv = 0 + p; - tx = SCALAR_LOAD((temp + 
(100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 10 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 10 * ldG), gik); - mv = 0 + m; pv = 1 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 11 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 11 * ldG), gik); - mv = 0 + m; pv = 2 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 12 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 12 * ldG), gik); - mv = 0 + m; pv = 3 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 13 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 13 * ldG), gik); - mv = 0 + m; pv = 4 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 14 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 14 * ldG), gik); - } - } + SCALAR_TYPE tx, wg, xik, gik; + tx = SCALAR_LOAD((temp + 100 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, 
wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = 
SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 14 
* ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = 
SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * 
ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = 
SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + 
p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = 
SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + 
p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 
111 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = 
SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * 
ldG), gik); + tx = SCALAR_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + 
SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = 
SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, 
wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + 
tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * 
ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = 
SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * 
ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = 
SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + 
p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights 
+ p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + wg = 
SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 136 * NPTS_LOCAL + 
p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = 
SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * 
ldG), gik); + tx = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); 
+ SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = 
SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = 
SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 6 
* ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik 
= SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik 
+ 14 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + 
xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 144 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); } } } diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx index 8945bc3c..42b1e7b4 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx @@ -726,222 +726,942 @@ void integral_4_1(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 1; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 1 - c0; - int n = c0 - c1; - int p = c1; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - int idxB = (((1 - m) * (1 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 5 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = 
comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); - - mv = 4 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = 
SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - 
tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t10 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SIMD_MUL(t10, const_value_w); - tz = SIMD_FMA(ty, t10, tz); - tw = SIMD_FMA(tx, t10, tw); - SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m 
- i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t11 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SIMD_MUL(t11, const_value_w); - tz = SIMD_FMA(ty, t11, tz); - tw = SIMD_FMA(tx, t11, tw); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t12 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SIMD_MUL(t12, const_value_w); - tz = SIMD_FMA(ty, t12, tz); - tw = SIMD_FMA(tx, t12, tw); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t13 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SIMD_MUL(t13, const_value_w); - tz = SIMD_FMA(ty, t13, tz); - tw = SIMD_FMA(tx, t13, tw); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t14 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SIMD_MUL(t14, const_value_w); - tz = SIMD_FMA(ty, t14, 
tz); - tw = SIMD_FMA(tx, t14, tw); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = 
SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = 
SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik 
+ 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx 
= SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, 
t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = 
SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = 
SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + 
t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty 
= SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + 
tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 
3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + 
p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk 
+ 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); } } for(; p_outer < npts; p_outer += NPTS_LOCAL) { - size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); + size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); double *_point_outer = (_points + p_outer); double X_AB = rA.x - rB.x; @@ -2288,217 +3008,937 @@ void integral_4_1(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 1; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 1 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((1 - m) * (1 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 5 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE 
const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - mv = 4 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + 
pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = 
SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t10 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SIMD_MUL(t10, const_value_w); - tz = SIMD_FMA(ty, t10, tz); - tw = SIMD_FMA(tx, t10, tw); - SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = 
SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t11 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SIMD_MUL(t11, const_value_w); - tz = SIMD_FMA(ty, t11, tz); - tw = SIMD_FMA(tx, t11, tw); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t12 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SIMD_MUL(t12, const_value_w); - tz = SIMD_FMA(ty, t12, tz); - tw = SIMD_FMA(tx, t12, tw); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t13 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SIMD_MUL(t13, const_value_w); - tz = SIMD_FMA(ty, t13, tz); - tw = SIMD_FMA(tx, t13, tw); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t14 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SIMD_MUL(t14, const_value_w); - tz = SIMD_FMA(ty, t14, tz); - tw = SIMD_FMA(tx, t14, 
tw); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = 
SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + 
t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 
12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, 
tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * 
ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + 
X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + 
t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = 
SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); 
+ SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 
0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + 
p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = 
SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz 
= SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); } for(; p_inner < npts_inner; p_inner += SCALAR_LENGTH) { @@ -2509,217 +3949,937 @@ void integral_4_1(size_t npts, SCALAR_TYPE const_value_v = SCALAR_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 1; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 1 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((1 - m) * (1 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SCALAR_TYPE const_value_w; + SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 5 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SCALAR_TYPE tx, ty, tz, tw; - SCALAR_TYPE const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); - - mv = 4 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = 
SCALAR_LOAD((Gik + 0 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t0 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SCALAR_MUL(t0, const_value_w); - tz = SCALAR_FMA(ty, t0, tz); - tw = SCALAR_FMA(tx, t0, tw); - SCALAR_STORE((Gik + 0 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 1 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 1 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t1 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SCALAR_MUL(t1, const_value_w); - tz = SCALAR_FMA(ty, t1, tz); - tw = SCALAR_FMA(tx, t1, tw); - SCALAR_STORE((Gik + 1 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 2 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 2 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t2 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SCALAR_MUL(t2, const_value_w); - tz = SCALAR_FMA(ty, t2, tz); - tw = SCALAR_FMA(tx, t2, tw); - SCALAR_STORE((Gik + 2 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 3 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 3 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t3 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SCALAR_MUL(t3, const_value_w); - tz = SCALAR_FMA(ty, t3, tz); - tw = SCALAR_FMA(tx, t3, tw); - SCALAR_STORE((Gik + 3 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 4 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 4 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - 
SCALAR_TYPE t4 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SCALAR_MUL(t4, const_value_w); - tz = SCALAR_FMA(ty, t4, tz); - tw = SCALAR_FMA(tx, t4, tw); - SCALAR_STORE((Gik + 4 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 5 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 5 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t5 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SCALAR_MUL(t5, const_value_w); - tz = SCALAR_FMA(ty, t5, tz); - tw = SCALAR_FMA(tx, t5, tw); - SCALAR_STORE((Gik + 5 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 6 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 6 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t6 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SCALAR_MUL(t6, const_value_w); - tz = SCALAR_FMA(ty, t6, tz); - tw = SCALAR_FMA(tx, t6, tw); - SCALAR_STORE((Gik + 6 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 7 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 7 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t7 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SCALAR_MUL(t7, const_value_w); - tz = SCALAR_FMA(ty, t7, tz); - tw = SCALAR_FMA(tx, t7, tw); - SCALAR_STORE((Gik + 7 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 8 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 8 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t8 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) 
>> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SCALAR_MUL(t8, const_value_w); - tz = SCALAR_FMA(ty, t8, tz); - tw = SCALAR_FMA(tx, t8, tw); - SCALAR_STORE((Gik + 8 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SCALAR_LOAD((Xik + 9 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 9 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t9 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SCALAR_MUL(t9, const_value_w); - tz = SCALAR_FMA(ty, t9, tz); - tw = SCALAR_FMA(tx, t9, tw); - SCALAR_STORE((Gik + 9 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 10 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 10 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t10 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SCALAR_MUL(t10, const_value_w); - tz = SCALAR_FMA(ty, t10, tz); - tw = SCALAR_FMA(tx, t10, tw); - SCALAR_STORE((Gik + 10 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 11 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 11 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t11 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SCALAR_MUL(t11, const_value_w); - tz = SCALAR_FMA(ty, t11, tz); - tw = SCALAR_FMA(tx, t11, tw); - SCALAR_STORE((Gik + 11 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 12 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 12 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t12 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = 
SCALAR_MUL(t12, const_value_w); - tz = SCALAR_FMA(ty, t12, tz); - tw = SCALAR_FMA(tx, t12, tw); - SCALAR_STORE((Gik + 12 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SCALAR_LOAD((Xik + 13 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 13 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t13 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SCALAR_MUL(t13, const_value_w); - tz = SCALAR_FMA(ty, t13, tz); - tw = SCALAR_FMA(tx, t13, tw); - SCALAR_STORE((Gik + 13 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SCALAR_LOAD((Xik + 14 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 14 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t14 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SCALAR_MUL(t14, const_value_w); - tz = SCALAR_FMA(ty, t14, tz); - tw = SCALAR_FMA(tx, t14, tw); - SCALAR_STORE((Gik + 14 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = 
SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + 
SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + 
tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL 
+ p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, 
t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, 
const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 
* ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, 
const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * 
ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = 
SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, 
t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 
* ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + 
p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * 
ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); } } } diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx index cba4ee4c..08d8d177 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx @@ -13,7 +13,6 @@ #define PI 3.14159265358979323846 - namespace XCPU { void integral_4_2(size_t npts, double *_points, @@ 
-1199,222 +1198,3253 @@ void integral_4_2(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 2; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 2 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((2 - m) * (2 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 6 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); - - mv = 4 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = 
SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - 
SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, 
const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t10 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SIMD_MUL(t10, const_value_w); - tz = SIMD_FMA(ty, t10, tz); - tw = SIMD_FMA(tx, t10, tw); - SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t11 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SIMD_MUL(t11, const_value_w); - tz = SIMD_FMA(ty, t11, tz); - tw = SIMD_FMA(tx, t11, tw); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t12 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SIMD_MUL(t12, const_value_w); - tz = SIMD_FMA(ty, t12, tz); - tw = SIMD_FMA(tx, t12, tw); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - 
SIMD_TYPE t13 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SIMD_MUL(t13, const_value_w); - tz = SIMD_FMA(ty, t13, tz); - tw = SIMD_FMA(tx, t13, tw); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t14 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SIMD_MUL(t14, const_value_w); - tz = SIMD_FMA(ty, t14, tz); - tw = SIMD_FMA(tx, t14, tw); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + 
p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * 
NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = 
SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = 
SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = 
SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, 
tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = 
SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, 
SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * 
NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = 
SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = 
SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, 
t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik 
+ 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 
39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp 
+ 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL 
+ p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 
* ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = 
SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik 
+ 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = 
SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik 
+ 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw 
= SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = 
SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + 
t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp 
+ 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz 
= SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + 
p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, 
const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); 
+ tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); 
+ t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); } } for(; p_outer < npts; p_outer += NPTS_LOCAL) { - size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); + size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); double *_point_outer = (_points + p_outer); double X_AB = rA.x - rB.x; @@ -3705,217 +6735,3248 @@ void integral_4_2(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 2; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 2 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((2 - m) * (2 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 6 - i - j - k; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); - - mv = 4 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + 
(offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = 
SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 
+ p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t10 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SIMD_MUL(t10, const_value_w); - tz = SIMD_FMA(ty, t10, tz); - tw = SIMD_FMA(tx, t10, tw); - SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t11 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SIMD_MUL(t11, const_value_w); - tz = SIMD_FMA(ty, t11, tz); - tw = SIMD_FMA(tx, t11, 
tw); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t12 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SIMD_MUL(t12, const_value_w); - tz = SIMD_FMA(ty, t12, tz); - tw = SIMD_FMA(tx, t12, tw); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t13 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SIMD_MUL(t13, const_value_w); - tz = SIMD_FMA(ty, t13, tz); - tw = SIMD_FMA(tx, t13, tw); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t14 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SIMD_MUL(t14, const_value_w); - tz = SIMD_FMA(ty, t14, tz); - tw = SIMD_FMA(tx, t14, tw); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = 
SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + 
tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 
* ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = 
SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp 
+ 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz 
= SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + 
p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + 
p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * 
Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 
22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp 
+ 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); 
+ t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, 
SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 
* ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, 
const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + 
t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = 
SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik 
+ 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx 
= SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, 
t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = 
SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz 
= SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, 
const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = 
SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + 
t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); 
+ ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + 
t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * 
ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * 
NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 
12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = 
SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = 
SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); 
+ tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); } for(; p_inner < npts_inner; p_inner += SCALAR_LENGTH) { @@ -3926,217 +9987,3248 @@ void integral_4_2(size_t npts, SCALAR_TYPE const_value_v = SCALAR_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 2; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 2 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((2 - m) * (2 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SCALAR_TYPE const_value_w; + SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 6 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * 
Z_ABp; - SCALAR_TYPE tx, ty, tz, tw; - SCALAR_TYPE const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); - - mv = 4 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 0 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t0 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SCALAR_MUL(t0, const_value_w); - tz = SCALAR_FMA(ty, t0, tz); - tw = SCALAR_FMA(tx, t0, tw); - SCALAR_STORE((Gik + 0 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 1 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 1 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t1 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SCALAR_MUL(t1, const_value_w); - tz = SCALAR_FMA(ty, t1, tz); - tw = SCALAR_FMA(tx, t1, tw); - SCALAR_STORE((Gik + 1 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 2 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 2 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t2 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SCALAR_MUL(t2, const_value_w); - tz = SCALAR_FMA(ty, t2, tz); - tw = SCALAR_FMA(tx, t2, tw); - SCALAR_STORE((Gik + 2 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 3 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 3 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t3 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SCALAR_MUL(t3, const_value_w); - tz = SCALAR_FMA(ty, t3, tz); - tw = SCALAR_FMA(tx, t3, tw); - SCALAR_STORE((Gik + 
3 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 4 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 4 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t4 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SCALAR_MUL(t4, const_value_w); - tz = SCALAR_FMA(ty, t4, tz); - tw = SCALAR_FMA(tx, t4, tw); - SCALAR_STORE((Gik + 4 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 5 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 5 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t5 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SCALAR_MUL(t5, const_value_w); - tz = SCALAR_FMA(ty, t5, tz); - tw = SCALAR_FMA(tx, t5, tw); - SCALAR_STORE((Gik + 5 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 6 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 6 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t6 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SCALAR_MUL(t6, const_value_w); - tz = SCALAR_FMA(ty, t6, tz); - tw = SCALAR_FMA(tx, t6, tw); - SCALAR_STORE((Gik + 6 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 7 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 7 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t7 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SCALAR_MUL(t7, const_value_w); - tz = SCALAR_FMA(ty, t7, tz); - tw = SCALAR_FMA(tx, t7, tw); - SCALAR_STORE((Gik + 7 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv 
= 2 + p - k; - tx = SCALAR_LOAD((Xik + 8 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 8 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t8 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SCALAR_MUL(t8, const_value_w); - tz = SCALAR_FMA(ty, t8, tz); - tw = SCALAR_FMA(tx, t8, tw); - SCALAR_STORE((Gik + 8 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SCALAR_LOAD((Xik + 9 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 9 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t9 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SCALAR_MUL(t9, const_value_w); - tz = SCALAR_FMA(ty, t9, tz); - tw = SCALAR_FMA(tx, t9, tw); - SCALAR_STORE((Gik + 9 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 10 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 10 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t10 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SCALAR_MUL(t10, const_value_w); - tz = SCALAR_FMA(ty, t10, tz); - tw = SCALAR_FMA(tx, t10, tw); - SCALAR_STORE((Gik + 10 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 11 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 11 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t11 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SCALAR_MUL(t11, const_value_w); - tz = SCALAR_FMA(ty, t11, tz); - tw = SCALAR_FMA(tx, t11, tw); - SCALAR_STORE((Gik + 11 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 12 * ldX)); - ty = 
SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 12 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t12 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SCALAR_MUL(t12, const_value_w); - tz = SCALAR_FMA(ty, t12, tz); - tw = SCALAR_FMA(tx, t12, tw); - SCALAR_STORE((Gik + 12 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SCALAR_LOAD((Xik + 13 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 13 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t13 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SCALAR_MUL(t13, const_value_w); - tz = SCALAR_FMA(ty, t13, tz); - tw = SCALAR_FMA(tx, t13, tw); - SCALAR_STORE((Gik + 13 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SCALAR_LOAD((Xik + 14 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 14 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t14 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SCALAR_MUL(t14, const_value_w); - tz = SCALAR_FMA(ty, t14, tz); - tw = SCALAR_FMA(tx, t14, tw); - SCALAR_STORE((Gik + 14 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * 
comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 
0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = 
SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 
1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * 
ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk 
+ 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + 
Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + 
SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty 
= SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = 
comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + 
SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = 
comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + 
SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; 
comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + 
SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); 
+ ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 
1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + 
SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty 
= SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = 
comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + 
SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * 
X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * 
comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 
2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = 
SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * 
Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = 
SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + 
t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = 
SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + 
t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = 
SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = 
SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, 
SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * 
NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = 
SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = 
SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = 
SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = 
SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + 
ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 
= SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = 
SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz 
= SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 50 * 
NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = 
SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); 
+ tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = 
SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + 
SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 
5 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = 
SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); } } } diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx index c3b0d68e..e4541197 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx @@ -1878,222 +1878,8638 @@ void integral_4_3(size_t npts, 
SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 3; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 3 - c0; - int n = c0 - c1; - int p = c1; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - int idxB = (((3 - m) * (3 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 7 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); - - mv = 4 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - 
SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, 
const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = 
SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t10 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SIMD_MUL(t10, const_value_w); - tz = SIMD_FMA(ty, t10, tz); - tw = SIMD_FMA(tx, t10, tw); - SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t11 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SIMD_MUL(t11, const_value_w); - tz = SIMD_FMA(ty, t11, tz); - tw = SIMD_FMA(tx, t11, tw); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t12 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SIMD_MUL(t12, const_value_w); - tz = SIMD_FMA(ty, t12, tz); - tw = SIMD_FMA(tx, t12, tw); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); - ty = 
SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t13 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SIMD_MUL(t13, const_value_w); - tz = SIMD_FMA(ty, t13, tz); - tw = SIMD_FMA(tx, t13, tw); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t14 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SIMD_MUL(t14, const_value_w); - tz = SIMD_FMA(ty, t14, tz); - tw = SIMD_FMA(tx, t14, tw); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = 
SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, 
const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = 
SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 
* ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + 
SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = 
SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); 
+ tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); 
+ t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = 
SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + 
SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = 
comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, 
const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, 
const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + 
tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, 
tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 
13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 
36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp 
+ 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = 
SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = 
SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + 
SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = 
SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 70 * 
NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * 
ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + 
const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * 
NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, 
tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); 
+ Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, 
tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = 
SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = 
SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + 
t5 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty 
= SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + 
tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + 
SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik 
+ 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, 
tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 
15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp 
+ 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = 
SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = 
SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + 
SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = 
SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, 
tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = 
SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 72 * 
NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * 
ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, 
const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, 
t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = 
SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, 
SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 
* ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, 
const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + 
tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + 
t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + 
SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = 
SIMD_ALIGNED_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = 
SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 
4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, 
t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 
* ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); 
+ t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty 
= SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, 
tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, 
tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + 
tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + 
SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik 
+ 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + 
t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * 
ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * 
NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + 
SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + 
SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = 
SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp 
+ 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz 
= SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + 
p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, 
const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, 
tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * 
ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, 
t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * 
Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * 
NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = 
SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = 
SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i 
* comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * 
ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, 
tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp 
+ 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); } } for(; p_outer < npts; p_outer += NPTS_LOCAL) { - size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); + size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); double *_point_outer = (_points + p_outer); double X_AB = rA.x - rB.x; @@ -5744,217 +14160,8633 @@ void integral_4_3(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 3; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 3 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((3 - m) * (3 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - 
double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 7 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - mv = 4 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = 
SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + 
(((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 10 
* ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t10 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SIMD_MUL(t10, const_value_w); - tz = SIMD_FMA(ty, t10, tz); - tw = SIMD_FMA(tx, t10, tw); - SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t11 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SIMD_MUL(t11, const_value_w); - tz = SIMD_FMA(ty, t11, tz); - tw = SIMD_FMA(tx, t11, tw); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t12 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SIMD_MUL(t12, const_value_w); - tz = SIMD_FMA(ty, t12, tz); - tw = SIMD_FMA(tx, t12, tw); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t13 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SIMD_MUL(t13, const_value_w); - tz = SIMD_FMA(ty, t13, tz); - tw = SIMD_FMA(tx, t13, tw); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 
+ p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t14 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SIMD_MUL(t14, const_value_w); - tz = SIMD_FMA(ty, t14, tz); - tw = SIMD_FMA(tx, t14, tw); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, 
tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = 
SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = 
SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = 
SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; 
comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, 
const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, 
const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + 
tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, 
tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik 
+ 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, 
tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); 
+ tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = 
SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = 
SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + 
SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = 
SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, 
t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * 
comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = 
SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = 
SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, 
tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = 
SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = 
SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 
4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, 
t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 
* ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = 
SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + 
tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, 
const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = 
SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, 
tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk 
+ 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 
* ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, 
tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk 
+ 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 
= SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = 
SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + 
SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = 
SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = 
SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + 
SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + 
const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * 
NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = 
SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = 
SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i 
* comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * 
ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, 
tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = 
SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, 
tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); 
+ Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + 
t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 75 * 
NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik 
+ 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + 
tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, 
tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik 
+ 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 
71 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp 
+ 97 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = 
SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 
* ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + 
t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = 
SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + 
p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, 
const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); 
+ tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * 
NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * 
ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + 
const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * 
NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = 
SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = 
SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * 
comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * 
ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, 
tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + 
SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = 
SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = 
SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + 
t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); } for(; p_inner < npts_inner; p_inner += SCALAR_LENGTH) { @@ -5965,217 +22797,8633 @@ void integral_4_3(size_t npts, SCALAR_TYPE const_value_v = SCALAR_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 3; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 3 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((3 - m) * (3 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SCALAR_TYPE const_value_w; + SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 7 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SCALAR_TYPE tx, ty, tz, tw; - SCALAR_TYPE const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); - - mv = 4 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 0 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE 
t0 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SCALAR_MUL(t0, const_value_w); - tz = SCALAR_FMA(ty, t0, tz); - tw = SCALAR_FMA(tx, t0, tw); - SCALAR_STORE((Gik + 0 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 1 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 1 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t1 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SCALAR_MUL(t1, const_value_w); - tz = SCALAR_FMA(ty, t1, tz); - tw = SCALAR_FMA(tx, t1, tw); - SCALAR_STORE((Gik + 1 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 2 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 2 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t2 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SCALAR_MUL(t2, const_value_w); - tz = SCALAR_FMA(ty, t2, tz); - tw = SCALAR_FMA(tx, t2, tw); - SCALAR_STORE((Gik + 2 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 3 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 3 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t3 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SCALAR_MUL(t3, const_value_w); - tz = SCALAR_FMA(ty, t3, tz); - tw = SCALAR_FMA(tx, t3, tw); - SCALAR_STORE((Gik + 3 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 4 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 4 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t4 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) 
* NPTS_LOCAL + p_inner)); - t4 = SCALAR_MUL(t4, const_value_w); - tz = SCALAR_FMA(ty, t4, tz); - tw = SCALAR_FMA(tx, t4, tw); - SCALAR_STORE((Gik + 4 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 5 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 5 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t5 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SCALAR_MUL(t5, const_value_w); - tz = SCALAR_FMA(ty, t5, tz); - tw = SCALAR_FMA(tx, t5, tw); - SCALAR_STORE((Gik + 5 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 6 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 6 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t6 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SCALAR_MUL(t6, const_value_w); - tz = SCALAR_FMA(ty, t6, tz); - tw = SCALAR_FMA(tx, t6, tw); - SCALAR_STORE((Gik + 6 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 7 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 7 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t7 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SCALAR_MUL(t7, const_value_w); - tz = SCALAR_FMA(ty, t7, tz); - tw = SCALAR_FMA(tx, t7, tw); - SCALAR_STORE((Gik + 7 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 8 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 8 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t8 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SCALAR_MUL(t8, const_value_w); - tz = 
SCALAR_FMA(ty, t8, tz); - tw = SCALAR_FMA(tx, t8, tw); - SCALAR_STORE((Gik + 8 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SCALAR_LOAD((Xik + 9 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 9 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t9 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SCALAR_MUL(t9, const_value_w); - tz = SCALAR_FMA(ty, t9, tz); - tw = SCALAR_FMA(tx, t9, tw); - SCALAR_STORE((Gik + 9 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 10 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 10 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t10 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SCALAR_MUL(t10, const_value_w); - tz = SCALAR_FMA(ty, t10, tz); - tw = SCALAR_FMA(tx, t10, tw); - SCALAR_STORE((Gik + 10 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 11 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 11 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t11 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SCALAR_MUL(t11, const_value_w); - tz = SCALAR_FMA(ty, t11, tz); - tw = SCALAR_FMA(tx, t11, tw); - SCALAR_STORE((Gik + 11 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 12 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 12 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t12 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SCALAR_MUL(t12, const_value_w); - tz = SCALAR_FMA(ty, t12, tz); - tw = SCALAR_FMA(tx, t12, 
tw); - SCALAR_STORE((Gik + 12 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SCALAR_LOAD((Xik + 13 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 13 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t13 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SCALAR_MUL(t13, const_value_w); - tz = SCALAR_FMA(ty, t13, tz); - tw = SCALAR_FMA(tx, t13, tw); - SCALAR_STORE((Gik + 13 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SCALAR_LOAD((Xik + 14 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 14 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t14 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SCALAR_MUL(t14, const_value_w); - tz = SCALAR_FMA(ty, t14, tz); - tw = SCALAR_FMA(tx, t14, tw); - SCALAR_STORE((Gik + 14 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); 
+ t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 
* ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 36 
* NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = 
SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + 
tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + 
t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = 
SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, 
const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * 
ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = 
SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, 
t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 
* ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = 
SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, 
t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 
* ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk 
+ 1 * ldG)); + t0 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + 
tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * 
ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = 
SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = 
SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + 
SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * 
ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = 
SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + 
SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 
* ldG)); + t0 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = 
SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 
36 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = 
SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + 
tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + 
t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = 
SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 
15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = 
SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + 
tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + 
t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = 
SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 
0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = 
SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = 
SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 
* ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 
= SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 
* ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 
= SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 
* ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 39 
* NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = 
SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + 
tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * 
NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = 
SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + 
tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * 
NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, 
t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = 
SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 
* ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, 
const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), 
tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = 
SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 
* ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, 
const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), 
tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + 
p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, 
tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = 
SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 
* ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 
= SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 
* ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, 
const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); 
+ SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * 
ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = 
SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * 
ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + 
SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); 
+ ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 
5 * ldG)); + t11 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + 
SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp 
+ 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik 
+ 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * 
ldG)); + t11 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + 
SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp 
+ 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = 
SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 
= SCALAR_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = 
SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 
= SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = 
SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 
= SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = 
SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = 
SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = 
SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = 
SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL 
+ p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = 
SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, 
const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + 
SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = 
SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + 
SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * 
ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = 
SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + 
SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 
= SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = 
SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + 
SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * 
NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, 
t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, 
const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), 
tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, 
t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), 
tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = 
SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, 
tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * 
ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + 
SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); 
+ ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), 
tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * 
ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = 
SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); 
+ tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + 
t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 
= SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); 
+ tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = 
SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL 
+ p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw 
= SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, 
const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + 
SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = 
SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, 
tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + 
SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); } } } diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx index 15b895e1..6a702fbf 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx @@ -2819,222 +2819,19398 @@ void integral_4_4(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 4; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 4 - c0; - int n = c0 - c1; - int p = c1; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - int idxB = (((4 - m) * (4 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 8 - i - j - k; - - int offset = (Lv * (Lv + 1) 
* (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); - - mv = 4 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = 
SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 
* ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t10 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SIMD_MUL(t10, const_value_w); - tz = SIMD_FMA(ty, t10, tz); - tw = SIMD_FMA(tx, t10, tw); - SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); - 
SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t11 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SIMD_MUL(t11, const_value_w); - tz = SIMD_FMA(ty, t11, tz); - tw = SIMD_FMA(tx, t11, tw); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t12 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SIMD_MUL(t12, const_value_w); - tz = SIMD_FMA(ty, t12, tz); - tw = SIMD_FMA(tx, t12, tw); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t13 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SIMD_MUL(t13, const_value_w); - tz = SIMD_FMA(ty, t13, tz); - tw = SIMD_FMA(tx, t13, tw); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t14 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 
= SIMD_MUL(t14, const_value_w); - tz = SIMD_FMA(ty, t14, tz); - tw = SIMD_FMA(tx, t14, tw); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 100 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 102 * 
NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + 
SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 4, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, 
tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 
* ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, 
t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = 
SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 
* ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = 
SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(4)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp 
+ 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL 
+ p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL 
+ p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * 
Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 
71 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, 
tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); 
+ Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, 
tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); 
+ t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + 
p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * 
NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, 
const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = 
SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = 
SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, 
const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + 
p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, 
const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 
* ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + 
p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = 
SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; 
comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t13 = 
SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = 
SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + 
t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, 
t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik 
+ 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, 
tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + 
t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * 
ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + 
tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = 
SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, 
const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = 
SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, 
t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + 
const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, 
const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, 
tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 
* ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = 
SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = 
SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); 
+ t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, 
const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = 
SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + 
p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 
* ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = 
SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); 
+ tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = 
SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); 
+ tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw 
= SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * 
ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, 
t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * 
comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = 
SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = 
SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, 
tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); 
+ Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + 
t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * 
NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * 
ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t5 
= SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + 
SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik 
+ 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, 
tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + 
t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * 
ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + 
tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = 
SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, 
const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = 
SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, 
const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, 
SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 
* ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, 
const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, 
const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 
* ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz 
= SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = 
SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); 
+ tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + 
t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 97 * 
NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, 
const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = 
SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = 
SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + 
p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, 
const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); 
+ tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, 
t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, 
t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * 
ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, 
t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * 
comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = 
SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = 
SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + 
SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = 
SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + X_ABp = 
SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, 
t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + 
t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, 
tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + 
tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, 
tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik 
+ 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 
0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * 
NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, 
const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = 
SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk 
+ 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 
* ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, 
tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk 
+ 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 
* NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + 
SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = 
SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + 
t6 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = 
SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * 
NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + 
t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz 
= SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 136 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 4, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, 
const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + 
SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(4)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx 
= SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = 
SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, 
const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 
* ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp 
+ 94 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = 
SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = 
SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * 
comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * 
ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = 
SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = 
SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik 
+ 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); 
+ SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + 
p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, 
t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); 
+ SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 
* ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = 
SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, 
const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 
* ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp 
+ 95 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + 
p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + 
SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j 
* comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 
* ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = 
SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + 
tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + 
p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, 
t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); 
+ SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 
2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = 
SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = 
SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 
141 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + 
p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + 
SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * 
Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = 
SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = 
SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = 
SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, 
const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + 
p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 
13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + 
t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = 
SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, 
tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * 
ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, 
const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 144 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 4, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz 
= SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 
14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = 
SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * 
ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * 
NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(4)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + 
SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = 
SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); } } 
for(; p_outer < npts; p_outer += NPTS_LOCAL) { - size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); + size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); double *_point_outer = (_points + p_outer); double X_AB = rA.x - rB.x; @@ -8567,217 +27743,19393 @@ void integral_4_4(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 4; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 4 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((4 - m) * (4 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 8 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - mv = 4 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * 
ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t10 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SIMD_MUL(t10, const_value_w); - tz = SIMD_FMA(ty, t10, tz); - tw = SIMD_FMA(tx, t10, tw); - SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t11 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SIMD_MUL(t11, const_value_w); - tz = SIMD_FMA(ty, t11, tz); - tw = SIMD_FMA(tx, t11, tw); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t12 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) 
+ pv) * NPTS_LOCAL + p_inner)); - t12 = SIMD_MUL(t12, const_value_w); - tz = SIMD_FMA(ty, t12, tz); - tw = SIMD_FMA(tx, t12, tw); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t13 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SIMD_MUL(t13, const_value_w); - tz = SIMD_FMA(ty, t13, tz); - tw = SIMD_FMA(tx, t13, tw); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t14 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SIMD_MUL(t14, const_value_w); - tz = SIMD_FMA(ty, t14, tz); - tw = SIMD_FMA(tx, t14, tw); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 100 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t4 = 
SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 4, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = 
SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + 
t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 
12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, 
t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + 
t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(4)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 
101 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk 
+ 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 
* ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = 
SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = 
SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = 
SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = 
SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, 
const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 
* ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + 
p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); 
+ t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, 
SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + 
t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = 
SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); 
+ SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 
0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + 
p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz 
= SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = 
SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 
36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp 
+ 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = 
SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = 
SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + 
SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = 
SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 106 * 
NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 
10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + 
const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 71 * 
NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, 
tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); 
+ Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, 
tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = 
SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = 
SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * 
NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + 
p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 
* ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = 
SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik 
+ 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 
* ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); 
+ tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + 
tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = 
SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = 
SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, 
t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * 
comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = 
SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = 
SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, 
tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); 
+ Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + 
t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = 
SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, 
tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp 
+ 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t8 
= SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); 
+ t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, 
const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = 
SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + 
p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 
* ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = 
SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = 
SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * 
NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * 
ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, 
const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = 
SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 
* ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = 
SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 
1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, 
const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, 
const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + 
tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, 
tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik 
+ 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, 
tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + 
t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * 
ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 116 * 
NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + 
SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * 
ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + 
p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + 
t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 
* ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, 
const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, 
const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = 
SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, 
t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * 
comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = 
SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = 
SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, 
tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = 
SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = 
SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + 
t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 113 * 
NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + 
SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik 
+ 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + 
p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = 
SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); 
+ t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, 
const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = 
SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, 
t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk 
+ 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk 
+ 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); 
+ tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, 
tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 
= SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = 
SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = 
SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + 
t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = 
SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 113 * 
NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 
10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = 
SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + 
p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = 
SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = 
SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 
1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 98 * 
NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = 
SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = 
SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = 
SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz 
= SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 
* ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 136 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 137 * 
NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 4, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t0 = 
SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = 
SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw 
= SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = 
SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + 
p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + 
SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(4)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j 
* comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * 
ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 
9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = 
SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = 
SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + 
p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, 
t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); 
+ SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 
2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = 
SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 
0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + 
p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 
7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * 
ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz 
= SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + 
const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = 
SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); 
+ tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + 
p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, 
t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 
* ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * 
NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty 
= SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, 
t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 
* ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + 
t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = 
SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 
* ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, 
const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = 
SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * 
ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = 
SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * 
NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, 
tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz 
= SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 
* ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 
13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, 
const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * 
NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + 
p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 
9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 
= SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = 
SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, 
t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = 
SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = 
SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw 
= SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, 
const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * 
ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 140 
* NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 144 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, 
Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 4, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, 
tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, 
const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); 
+ tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 
14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + 
p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(4)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, 
const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik 
+ 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); } for(; p_inner < npts_inner; p_inner += SCALAR_LENGTH) { @@ -8788,217 +47140,19393 @@ void integral_4_4(size_t npts, SCALAR_TYPE const_value_v = SCALAR_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 4; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 4 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((4 - m) * (4 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SCALAR_TYPE const_value_w; + SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 8 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SCALAR_TYPE tx, ty, tz, tw; - SCALAR_TYPE const_value_w = SCALAR_MUL(const_value_v, 
SCALAR_DUPLICATE(&(const_value))); - - mv = 4 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 0 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t0 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SCALAR_MUL(t0, const_value_w); - tz = SCALAR_FMA(ty, t0, tz); - tw = SCALAR_FMA(tx, t0, tw); - SCALAR_STORE((Gik + 0 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 1 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 1 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t1 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SCALAR_MUL(t1, const_value_w); - tz = SCALAR_FMA(ty, t1, tz); - tw = SCALAR_FMA(tx, t1, tw); - SCALAR_STORE((Gik + 1 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 2 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 2 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t2 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SCALAR_MUL(t2, const_value_w); - tz = SCALAR_FMA(ty, t2, tz); - tw = SCALAR_FMA(tx, t2, tw); - SCALAR_STORE((Gik + 2 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 3 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 3 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t3 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SCALAR_MUL(t3, const_value_w); - tz = SCALAR_FMA(ty, t3, tz); - tw = SCALAR_FMA(tx, t3, tw); - SCALAR_STORE((Gik + 3 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx 
= SCALAR_LOAD((Xik + 4 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 4 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t4 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SCALAR_MUL(t4, const_value_w); - tz = SCALAR_FMA(ty, t4, tz); - tw = SCALAR_FMA(tx, t4, tw); - SCALAR_STORE((Gik + 4 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 5 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 5 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t5 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SCALAR_MUL(t5, const_value_w); - tz = SCALAR_FMA(ty, t5, tz); - tw = SCALAR_FMA(tx, t5, tw); - SCALAR_STORE((Gik + 5 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 6 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 6 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t6 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SCALAR_MUL(t6, const_value_w); - tz = SCALAR_FMA(ty, t6, tz); - tw = SCALAR_FMA(tx, t6, tw); - SCALAR_STORE((Gik + 6 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 7 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 7 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t7 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SCALAR_MUL(t7, const_value_w); - tz = SCALAR_FMA(ty, t7, tz); - tw = SCALAR_FMA(tx, t7, tw); - SCALAR_STORE((Gik + 7 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 8 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - 
tz = SCALAR_LOAD((Gik + 8 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t8 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SCALAR_MUL(t8, const_value_w); - tz = SCALAR_FMA(ty, t8, tz); - tw = SCALAR_FMA(tx, t8, tw); - SCALAR_STORE((Gik + 8 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SCALAR_LOAD((Xik + 9 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 9 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t9 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SCALAR_MUL(t9, const_value_w); - tz = SCALAR_FMA(ty, t9, tz); - tw = SCALAR_FMA(tx, t9, tw); - SCALAR_STORE((Gik + 9 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 10 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 10 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t10 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SCALAR_MUL(t10, const_value_w); - tz = SCALAR_FMA(ty, t10, tz); - tw = SCALAR_FMA(tx, t10, tw); - SCALAR_STORE((Gik + 10 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 11 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 11 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t11 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SCALAR_MUL(t11, const_value_w); - tz = SCALAR_FMA(ty, t11, tz); - tw = SCALAR_FMA(tx, t11, tw); - SCALAR_STORE((Gik + 11 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 12 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 12 * ldG)); - tw = 
SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t12 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SCALAR_MUL(t12, const_value_w); - tz = SCALAR_FMA(ty, t12, tz); - tw = SCALAR_FMA(tx, t12, tw); - SCALAR_STORE((Gik + 12 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SCALAR_LOAD((Xik + 13 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 13 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t13 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SCALAR_MUL(t13, const_value_w); - tz = SCALAR_FMA(ty, t13, tz); - tw = SCALAR_FMA(tx, t13, tw); - SCALAR_STORE((Gik + 13 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SCALAR_LOAD((Xik + 14 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 14 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t14 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SCALAR_MUL(t14, const_value_w); - tz = SCALAR_FMA(ty, t14, tz); - tw = SCALAR_FMA(tx, t14, tw); - SCALAR_STORE((Gik + 14 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = 
SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 100 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * 
ldG)); + t10 = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 4, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * 
X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * 
comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 
0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = 
SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 
1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * 
ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk 
+ 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(4)); + Y_ABp = 1.0; comb_n_j = 1.0; + 
Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + 
SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty 
= SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = 
comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + 
SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * 
ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + 
const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), 
tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * 
ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 
1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + 
SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); 
+ ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 
1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + 
SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); 
+ ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 
1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = 
SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, 
SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = 
SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, 
SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = 
SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + 
SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); 
comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = 
SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 
1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j 
= 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = 
SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, 
SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + 
SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); 
+ ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 
1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = 
SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, 
SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + 
SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); 
+ ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 
1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = 
SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, 
SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + 
SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); 
+ ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 
1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = 
SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, 
SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + 
SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty 
= SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = 
comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + 
SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * 
ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + 
const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), 
tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * 
ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + 
const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), 
tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * 
ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 
1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + 
SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); 
+ ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 
1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + 
SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); 
+ ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 
1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + 
SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); 
+ ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 
1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = 
SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, 
SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = 
SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, 
SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = 
SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; 
comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + 
SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * 
ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + 
const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), 
tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * 
ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + 
const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), 
tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * 
ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * 
comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), 
tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 
* ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = 
comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + 
SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * 
X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * 
Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = 
SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + 
t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * 
Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik 
+ 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = 
SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + 
t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = 
SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + 
t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, 
SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz 
= SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL 
+ p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = 
SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); 
+ tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t10 = 
SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * 
ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 
* ldG)); + t5 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + 
tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 41 * 
NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = 
SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 
= SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = 
SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz 
= SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * 
NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = 
SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); 
+ tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = 
SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + 
SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + 
tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 50 * 
NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = 
SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); 
+ tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = 
SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + 
SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 
5 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = 
SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 
106 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw 
= SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), 
tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = 
SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, 
t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 
* ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = 
SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, 
t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 
* ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = 
SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, 
t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 
* ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk 
+ 6 * ldG)); + t0 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + 
tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * 
ldG)); + t0 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = 
SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * 
ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = 
SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * 
ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = 
SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 
107 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw 
= SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), 
tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + 
p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, 
tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 68 * 
NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = 
SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + 
tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = 
SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 
* ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 
= SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 
* ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, 
const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), 
tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + 
p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, 
tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = 
SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 
* ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 
= SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 
* ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, 
const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), 
tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = 
SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 
* ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, 
const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); 
+ SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * 
ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw 
= SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx 
= SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 
11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + 
SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); 
+ ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 
8 * ldG)); + t11 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + 
SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp 
+ 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + 
SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp 
+ 88 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = 
SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 
= SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = 
SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik 
+ 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = 
SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 
= SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = 
SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = 
SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL 
+ p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = 
SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = 
SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL 
+ p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = 
SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, 
const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + 
SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * 
ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = 
SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + 
SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 113 * 
NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = 
SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * 
ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); 
+ t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + 
SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, 
const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), 
tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, 
t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), 
tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = 
SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, 
const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), 
tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, 
t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), 
tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = 
SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, 
tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * 
ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); 
+ SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + 
tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = 
SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t0 = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t1 = SCALAR_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 10 * 
ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t2 = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t3 = SCALAR_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t4 = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t5 = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t6 = SCALAR_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + 
tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t7 = SCALAR_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t8 = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t9 = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t10 = SCALAR_LOAD((temp + 136 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t11 = SCALAR_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * 
ldG)); + t12 = SCALAR_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t13 = SCALAR_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t14 = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 4, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t0 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t1 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + 
SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t2 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t3 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t4 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t5 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t6 = SCALAR_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t7 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t8 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t9 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t10 = SCALAR_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t11 = SCALAR_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = 
SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t12 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t13 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t14 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t0 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = 
SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t6 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + 
SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t7 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty 
= SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = 
SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + 
SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx 
= SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(4)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * 
NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = 
SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk 
+ 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t0 = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t1 = SCALAR_LOAD((temp + 116 * 
NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t2 = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t3 = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t4 = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t5 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t6 = SCALAR_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, 
tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t7 = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t8 = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t9 = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t10 = SCALAR_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t11 = SCALAR_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); 
+ SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t12 = SCALAR_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t13 = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t14 = SCALAR_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t0 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t1 = 
SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t2 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t3 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t4 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t5 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t6 = SCALAR_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = 
SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t7 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t8 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t9 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t10 = SCALAR_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t11 = SCALAR_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 
* ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t12 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t13 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t14 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t0 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 11 * ldG)); + t1 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t2 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t3 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t4 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t5 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t6 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = 
SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t7 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t8 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t9 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t10 = SCALAR_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t11 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = 
SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t12 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t13 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t14 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t0 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * 
ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t6 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + 
p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t7 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); 
+ tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * 
ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t6 = 
SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, 
const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 
11 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = 
SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + 
tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = 
SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t11 = 
SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + 
SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 
11 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + 
t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t0 = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + 
SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t1 = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t2 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t3 = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t4 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t5 = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t6 = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t7 = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t8 = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t9 = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t10 = SCALAR_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 12 * ldG)); + t11 = SCALAR_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t12 = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t13 = SCALAR_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t14 = SCALAR_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t0 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + 
SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t1 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t2 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t3 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t4 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t5 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t6 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t7 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t8 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t9 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t10 = SCALAR_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = 
SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t11 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t12 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t13 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t14 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t0 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = 
SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), 
tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t6 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t7 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz 
= SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t0 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz 
= SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t1 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t2 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t3 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t4 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t5 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); 
+ SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t6 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t7 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t8 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t9 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t10 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t11 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t12 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t13 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t14 = SCALAR_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); 
+ tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), 
tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty 
= SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, 
const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t0 = SCALAR_LOAD((temp 
+ 41 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t5 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, 
tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t8 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t11 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t0 = 
SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = 
SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), 
tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t0 
= SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = 
SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), 
tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t0 = 
SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t1 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t2 = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t3 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t4 = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t5 = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + 
tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t6 = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t7 = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t8 = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t9 = SCALAR_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t10 = SCALAR_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 
10 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t11 = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t12 = SCALAR_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t13 = SCALAR_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t14 = SCALAR_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 
* ldG)); + t0 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t1 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t2 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t3 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t4 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t5 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, 
const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t6 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t7 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t8 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t9 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t10 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + 
SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t11 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t12 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t13 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t14 = SCALAR_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 13 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = 
SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, 
tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 13 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 
= SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, 
t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = 
SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t0 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t1 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t2 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t3 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t4 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t5 = SCALAR_LOAD((temp + 
84 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t6 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t7 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t8 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t9 = SCALAR_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t10 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, 
t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t11 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t12 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t13 = SCALAR_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t14 = SCALAR_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz 
= SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t0 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t5 = SCALAR_LOAD((temp 
+ 50 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t8 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = 
SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t11 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 
* ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t5 = 
SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + 
tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk 
+ 13 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t5 = 
SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = 
SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 
* ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t0 = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t1 = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t2 = SCALAR_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t3 = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t4 = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t5 
= SCALAR_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t6 = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t7 = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t8 = SCALAR_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t9 = SCALAR_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t10 = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, 
const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t11 = SCALAR_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t12 = SCALAR_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t13 = SCALAR_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t14 = SCALAR_LOAD((temp + 144 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 4, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + 
ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t0 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t1 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t2 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t3 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t4 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 14 * ldG)); + t5 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t6 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t7 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t8 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t9 = SCALAR_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t10 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + 
t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t11 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t12 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t13 = SCALAR_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t14 = SCALAR_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = 
SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t0 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 
5 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t5 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t8 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL 
+ p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t11 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx 
= SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik 
+ 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * 
NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(4)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, 
SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 
14 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t10 = 
SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); } } } From 37488d32acc7d77edeea70fdcaeab4a94f514cc6 Mon Sep 17 00:00:00 2001 From: David Williams-Young Date: Mon, 11 Mar 2024 14:31:16 -0700 Subject: [PATCH 4/7] Missing MIN -> std::min --- 
.../local_work_driver/host/obara_saika/src/integral_4.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx index 7b0ab4ed..7bb8f02c 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx @@ -4841,7 +4841,7 @@ void integral_4(size_t npts, // cleanup code for(; p_outer < npts; p_outer += NPTS_LOCAL) { - size_t npts_inner = MIN((size_t) NPTS_LOCAL, npts - p_outer); + size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); double *_point_outer = (_points + p_outer); double xA = rA.x; From 52736979033eac3cc012115ac00d2895486d9347 Mon Sep 17 00:00:00 2001 From: David Williams-Young Date: Wed, 13 Mar 2024 12:47:09 -0700 Subject: [PATCH 5/7] Added BasisSet::max_l --- include/gauxc/basisset.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/gauxc/basisset.hpp b/include/gauxc/basisset.hpp index 7ad1f105..0e0c659d 100644 --- a/include/gauxc/basisset.hpp +++ b/include/gauxc/basisset.hpp @@ -136,6 +136,11 @@ struct BasisSet : public std::vector> { return _nbf; } + inline int32_t max_l() const { + return std::max_element(this->cbegin(), this->cend(), + [](const auto& a, const auto& b) { return a.l() < b.l(); })->l(); + } + }; // class BasisSet } // namespace GauXC From ba48c6b21064b6309be5d4eb7950f07dc406365f Mon Sep 17 00:00:00 2001 From: David Williams-Young Date: Wed, 13 Mar 2024 12:48:12 -0700 Subject: [PATCH 6/7] Added H2O2 def2-{T,Q}ZVP tests to check sn-K with high AM, disable L > 2 for device tests for now --- tests/ref_data/h2o2_def2-qzvp.hdf5 | Bin 0 -> 763512 bytes tests/ref_data/h2o2_def2-tzvp.hdf5 | Bin 0 -> 155064 bytes tests/xc_integrator.cxx | 21 ++++++++++++++++++++- 3 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 tests/ref_data/h2o2_def2-qzvp.hdf5 
create mode 100644 tests/ref_data/h2o2_def2-tzvp.hdf5 diff --git a/tests/ref_data/h2o2_def2-qzvp.hdf5 b/tests/ref_data/h2o2_def2-qzvp.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..df736635f51f80bf15c2f19012683b13db978eb3 GIT binary patch literal 763512 zcmeFa2{=|=-#&bq$&gGTAycT3d01V`oOzy-WTwHGDJf(uW5^Iiij))$8Z6hMWN4yN zN(vDr%21}npF7w4_T0~Xzt8i1-*J4;|9G<evF`t5U_Ywsg^+Pcgu zcvlb>7kYXEHG%byRr1ID(h^$2_(v9bn_TX+v_i3zCM>0;ls{J}2#bFN0{2q>s-^b- zT+hlvM~ARbliWTzUD}OMO8s+9oSdg)NrXD8NXzlE@dwulI9QmhAvg0SVA%d z#m|f-ST6i-elAo5);~(P7BU1EvNQsHy?40^ngn^fxCboN`JWP2dcsm$f&@E3EnnXN zw_QHYf$od@EEN5Z%PK**R7ag4&q$D;ry>xx{i%(A{B{3xbAQ!_{h!+SvyJ}T%3rlX zvv6jM7Nex3{KIhcOLnB95d7&_2$BEL4BOI`ELN_Z8FjZbG1aCZ(EgVnDO}p?A9ng> zLvU%GmF#GKDf@L*W@$s_=&!l|yso^IrJnfDWxwW-zazl~+y76C_)?+Zk^iaIa@n#4 zmMySsfn^K)rz}AB=YMznzbn^pP3(ETlUDtY&Te7n%d0)@?s9yG&1U6 z$K-O)4O44eB^Vd>{~3SSEmiz;{GqtC139H4jQr8hiqR4X;tR2niJ67I@lu^%V+*6D zyv4ZX*L*W;i$7wg#q0VtU&F{uPh&A};a_s$KXT!J@s!B%#9#IKTBbU>x{GxdD*n6s zm|Ey>)i+t#XW`rz_gRcJ7LP;jPXFpS3&jgo`n&j|kbSY7K=2Fvng0CzbA0h@+4B0b z1(q$aY=LD9EL-3|ZGm6o3l19s;VPYd)H(lI_0oIY&RpD2)Zcu3%swD{S>1b;ODOHN zW9kzXZyY}hgs6+>>|6Kds+0N`=iI4OY6JD{Y(?iZf)&*FEM3rBC5hax)&zo6C82mR zt+20fpN$|8ZZ4(WODUONNf4SjyI8MjDc|Yx&;1IY&Hjgpd28D}1c^TOBLQ{XX;E$I_pkToF@=a^L zL$2Fw5w@49)$b0iC{bW$R*ydOL&u?XmAbQxfd7h<9Y3owD7!NHj zb}c`z>*iAa$kKNAkbgZr>|{P-`g!k%RI%QzUiBLihdE{f9-wo%}a`ho+neQTlO@3r&`6R z&Q&+iUBSnq-k0^^W9tN?I(0~3+2KP}zukTDB1V42Nk%-s->#`a_`Bld+a_7*ce}^n zGEV-{q5K&qyW~tQhN7SUwm9)*BhVtgX^$HMa4#wR*2wu?1ClUa^W{sm4RJzQMA{S?q5XwNm7MC!Xuo<4H@J-?rb; zgzvo4Yb2NZoqvIoiU6-;!{MKPTYnNzQngi~S9GS=K3iy0WZO zBBSAFO7uiyX<4WAD^mUrC(AmeWu4N$gOg>Q(y~tJ-@(bUPH9=E^jA1p)+znt z{m!yZ=~t92JRjzldE(?)cJY2^S*NtDQ~DJte}|K0ozk*S>EFT0vQBAPr}TF?`Bhi2 ztW#RnDg8S*S=K2n>y-W-oGj~4{o@=qJQ~DJp|9D^gD^8YmO3ON> zUy<_beetqR>A(HSvQFt&l>E0pAwS<|v2>AVV-i?pDfGfqJ?N6SD0LoC5C53X^^Ad6 z{cf~Jdri;}_`iMLXy+dOZ8s^K;Zv^8E=ApTczl1q*Y^Bg7%|mo)1W&H|F_Q@{a4TB zB;!PjQa4*^eJ}i7ak7#6w9Pz2*D_9)+{&My-%7?wS6|S>bk_&JEly1A&*ykueYlL1 
ze}NO%rfcPSsn>p6oV59?akj=)FXQAdaPsRoAPUUjrnaH{ZR3+jg2xwA4p#N$_~a*0 ze(iUD#fj1p4X5|5gTJjmnUe}N-lRXa>`(rMKj|&bs~4nUFKsK z-lz31ak9+EF7vVf4o;T&*kwNU?{M-f&$i6RF7vVf4o;T&*kwNU-@(Z;AG^%Q{vA%1 z`PhY+$Eu;&UIx{~+Ed zD1Y7sUHW5-K=`|qYT=FDi#7kBzo(lu?Pt@1R~B(fUjF$#-CpFP|6fuH!hgyhGMYcR z@&DWHvG|7e1)Na)#0U9anVkN=$Aj5dx<6X}^~sV1!aw67^p9HqsgM60KJ1q4^M4mU z{!`z>@^P0fuxx?H;$#l%;*R{1AqkNR;3+w_uIqiQvA6 zr!cLu>y;DlD)C5U2IH-Zg?Q=Q-Hd_M!hX31bTi=6>F*Tg8tF+ zNv?(Ypfzl|d@9HrwlWVlA_YIF`<_d1@2S%unQl!wcUK^knu*=RGP)ZEh}LBeaUTUo zGV~&zFztkM_)zLvbOcpCD7;&E>M-JI%%YcLS_c{0j#B-gp@6Z2eaziKyihX2`Upqi zVK5i}ozDJjIJkduCEe~2A?%j8$>Un9Hhkfj?ZT9AfW>=gM9$b*0=C-7&-6U;NQx(V z|H%VeVC4=6LF+I}$m{ThVl2Q6XO|To@hS@gw4-hxR(}iy8P%1qQ_7El9EI9^$KrM( zXk6{Ot>zt3RPy;)@+%5xH>!<9(RVkWVBK%RsT%JljZwO-5Z2$7XCkAzk$Q5bQ zt7G##@9BLnl;C;G@Kk(g(!?h7U663-Qw41! z6OP$K{p!_&L{xg)$TM%fH>~aL`53f=8?)@bd9t}P8YtU&Qkn+(LA04pw)m(%Skv_W0O)4C%Q9Ej8N7BI*O=qP_d4tidH)Yb}L4H6{ z%z*f=L=a2cGM_&5g$~Eut!wY#7$zD^F|Mh49F5LkLbJ*bBS3tgZC6mQhfD6&oQt&E zgbz7*8)(XLz!D}Fov7RCNLb*boe}gvgDcNU-WE{DU8+xWhqxqgGeP^b)?Q&8cl;{n zwOBV4C%8q;Yx_}jw7`_ZeD5LP@wSeZdY&6zoxZE)*f>ah`%;se|La#G<%f!I-ZzFw zm*+jNaHM5`(}gKdRsaJKYv;6{{fIZ3a5h(r*v|*efoKs|94&MbKOZ*!>=yBTMh{~i zB?UZV?$;n&cMN@q>$Ls2eLs*b<8&47hz1uMy;jBy2O{bX&)+eOPLYbD^1ml?ydyQ$ zyjj~Pd5SbmM|&gwLMJKGiK=U4Mw?`}+VZ8{g8OOaff?D;9$ErOwA&0n; z3JZk>CJDwjtQ4>TWnKi@AKFmFKDI4mG^_>4z@s5nr$GY179b!AC0H8ih_bl@kHP(F_GHG768e206zSvW}3KEXD*1v6Gf_-Z? z7jQh$gAre6w?BR204bs`=%i>n!}7P@o$dy5ctq@z*}c6h;gO7W+5ETcftlzwM>n=c zVyI#JhHxK6nAm5_?Jk~=%x1D44LvJ(-(X_=NQTFG7rDr$blMkCI z53{5p$?J+Ps>}z0sOTeq(Jh}zG9}j}VzciPH$~nH9(p%K+(zg+Jr$gQ?%aCtKKx}g zFlV=*4Z6aCMYhtW*vZV0n*5p?)+e&VPf_b5dmcrhFYz%NmUm(S_3?d2wjMV`vJI)? 
zPk6hCf;9&9whk?%_g#rA@49sp)rBMY##6J1WIucIyb-a+lv2&|LGgvz z=TjLEqwmF{-HHrpl=kwlir?~6F~J$N>ez>beXv07-9yvx2KarBVLgd*tX@m8goFQkNdps$5?S8T4z;#U<BC%IPlHthj;*&}DGLoQ%Lj2gE5gtV zKl}t7w?OrN8IR_R-$`EssN>D3B0*K(MwdOF>QKF#kEzb^BT4)7op0?=;y~0r?mhFm zny{Ad{&d7V5A5HV;^EvI2Ra)pjt-C{V1;~h;Mt7~I6Vi_=YI$VH~Wkz&h8MzGnCF1 z5j&00)4I+59Ns!0^BUig{Mk`bLdhe}%X_;?SwE=X?7RMu*nfg(x}DJhMK;?>-;dP* zvsVe;yMy@gr_BoV3YX_e8%6ehUJpBo6vce;n^v~1)C@(3N65$l{7}@R`(GbM=Yz}Z@)>L zZ2$NyA)}Tk?uUy+PuwC!PRZS+4DkZj4$Ym*VPpi?ZWRb#r`n1}0vnA-y6E8tY482j z;)6t)SC>3RD^8H2^1n_AGd?G-a>=1x`OqEZ_1*7^Ofv=IJKxYt0{7|<% zoJj!LAi_&UrtPiKq}s<#)3$k^iE|Nk4@*s*kkQ@8LpNd=ARndL1wINQs$VltuN*4} zw6@ZQMe7X^W%Whaqs`tC)#j(F=Cx)>f!5#GvDDiF`j%JwTAbDb((X)WbH6d-7fyy< z=P5ss0ymdrs_m;JafmaP#XA5bx;2hvm_ii^c&33kg{`0>^QJwccM|EtlwmfDXB$cR z{fFLCy&J>`!`R6wnpEb&}o7Rh~%|7nmR8`PP(ZPKdliy}AVU9=E5O-!k8OiglG4K?Xe zkJTOx#QnT5@%U2$o)B|XpD}kuR$})bQzw~&57wf-W}{C@O(>zW$@vn=v$oINGQEW; zHAm&r8)1(gKHE2jgzSJ@xW%~r^UowfmawMJwD(Cv!|Jb}-uO!NPZtjN)3ZjOojwd2 zemxL=+SZf85qJ^&7;2VVZxsl`d^A(?#6qFao|=Y%xXYl=gEd>= z_I{`)eagP}sXx^BaUFNMoC_!?U8Hx-xx(}9wqc(s)6itVS4rXLacEdKmnSGm1gdS9 zyOCkY3WsIJn6=f|;T1*#_2!caU~^q>!|moMu%c}CsfD;8X0#Z+&}Om~9^PT#qbaJ7 zg$oZV$!>H2k1`VF37eA8sh5MsRo}M29J@E(3GUX=_Qw9gTSG>eLgYa>+rda+`@}~3 z!^k0Ub_L-mJxvxM7`R-|?|M#RG3o3LVH+m~+1Os&F~9(0?p3ICIZ@#K-U5aR4nauo znCWU2!-L3mV#gPmh6H5Qap&sgE9yAUvQPA)r6R22I?%wsUIx15Fsu`cQN(&RF$et) z#-L3;lC~Uo!x8Ths;@6zMgq~Ok6Rdb?L&to9Gq_KUyU0qSumC03<(y=s)xhxM5d3e zR!)pmaQ6V?dTG1@e|Jx+UyNt>*CX~tKWh=#;qg_$0Pm>dnOk{Y4j&tSLzx}s1l#X+ zRD}HSgdci()uz>bA;HT&x2cQ?KaBh0N=iD0sIF$W>Ynw7gvy-c?zmY}s@AS8j3xe}uJxwg19Di5v;9(}P8@3U^Zi(> zV2lm+uV7H|J%D6yn_k*Bu@6WVeWOXu6vjm7r@L>Cuf+53#czmRnjp@qu6(%Wast|+ zY&_GB^+9*Ic(wI=6a2h4H+nF4GiDVTc(tK`9VV}q8F4kuM!Hw0v)GjV&{4Js{lMb} zc=wj)B|f4uxKrdt9Vu=-cKf92I~TPJMVf>^>h(!OKfcync++|q*cn9C(eUs>pLe1m z0|UcEjkJJ3+RNXFPY?O98y0*fam#b+zp%;yef?vap|{OIMB`@Nv?zb1p6}DNrb-5~ z%d7Stq2h$v5V()F*C*cTI-cRyI79YOc8kY4hGMXy56WgWAWg? 
z+|_(c?QSbzj!#Crh`>sioWsrfjn@>;w5T&>d$_<5acTq2P44ivu_Nt<6eXM_e^rA% zkpphpvBRdp#2FCccfM%Ya*HUHTk))aojSC^{`rB&P9ycFC8f2;hKa@0_bS8&x5D(p zsu>X$N0DGw=)p_3Wbnhmx1)Q;GSK;RECbKj4ui@snrq*>&5|N{BpO%Fl#a7!C$#`s$yDt0H(ytUZ{ zy{u1v+x&JXZ1rMowf&ZkF1R1B-#c#sQ&N4_UE+;IKl<6~!rFq-3l%?GT?$Uvp(V#$ ztx5qqPNc2zAD$t0@>O^_?e_az7;MGaL;(r^CO3SfK0GMc4LDN zh)cX>wv{&)psnv7ncB3HTGQ%@Na!7LBGz%vz2G}>2m7a;79L+oPS%uSZ}@j1)j|gv zU;kZ*;Y$3f>bL`lTK~cR^Z;pmXnz0d4+Ig&F!%k@wlxB-)}F(vm=5>Uo84rsr-T~+F$vwaJHi1n|M=bGYPcB z0&L(>`#lu*F7ALMx`MtJ4hZ7FYe_5{%|cPf%9M*5tWHp!;bBo$9v}9Jxt(b(9tny* zoNB(c+67v%GMhgvR)(dq%5#$`hXE7ikE;u}_|QuK@eFZB7(0Esm1^SQ32dA9=d`pF zvCwnTFQ{%8@_jEWXCvwe81Wf zuXx@!@?x=?ZaS|XPOMllHPdEchiv=0=-nzJ(AD)7l!+$;Kz>!Os+tKsG(Y`mrla{4 z@z(Jdp1c-)M9R!{=hMaClG;W-&PFB2gO@Qe!%gK%fTjE}4aHLr)Y;FX)GsXvn>aTW zI+PM1!70mk?Ti!g$mcJE#yR6e@cDLSqh~C-;{51ce!VYPL0?}WheCj)jCIp%6)$w9 zH(`L+{X40A-_0)-(Qil{F{gajN!}&NhVpMsJL;^ z!V?SUT~~jdY6@3;n2O(GrG&@(j^Bw$R)f^xY{##hHv?l^9-UPiwG(j4%aWtKWkvetBelnSC#Gqy7sTO%Qt9)Fk;VPlWnZDJw^~_@f zHLM8Q;@=o&0@SC7HlMqzNM9-%@7c0SL${$wrKvk* z`(y?q!OZal?6eLS74oqe?}$d06Az1yd;0^GspGj|evp*5#w{#tCn9mWMBGuGA0S>X zJKe;_br78|%A&m_8w#pKbi#O;R$$gv(}NZPBP55Nq|#~emGJ$vi$IiN5ZWy1mFd72 z214vt)AgCDqnrLQCCaYtL^JBh56?JXkpfS6$34nLL^_v_C>`BPL^3~1<_*a4EZNUy z39}@iGdJMq;eyyU=6mp3!?)%D<7?1HsHON`PYTZ4hUT~5zl^2_gJmv6r{WWH*9XeM z5j+TPncm)7f#N;o;y>mk;`py@N^A!baof`qv{tUy(9?=FvI%#`AHwv0$cH+PP(Wtpd%24-;?{Jf27I>tP-fgO!1X#MY8a7R) zAz7JP>J>%Npn1}_s`m;XJO$kud?v?W#3MmXlSdHIb|!5*ut zlyPYFW}?tK`c=0-SYf>v&5`Gym_qSNgR-5Z44}N(`}x`R?ojw~{Ilb0j)2-nM@RQO zj7FDb>{al)h4}vFqw2(3e%R1lJYMy11wKG=uI-!_#tMU_|+*)s?;-kjnPu^DEN;jwIZZZnss0 zhw=qDB(?VgyR*AURp$!9Q$?$zTXi##u6FrYvxN|TeK?RGo}VSX{gIy%dGiM;;1k@N z*)~HI=J2f%4=+VF^iwnrd+kvY`)JG$p=iM3V0&&?ur5v-oU1X@+KAPK#OHSmj1#H$ z-DFp(;=oD?{3M&SD%G}I|O$?4pai?*d%Y_U0d1l$Rdquqa32s#LKmZo&k zL82yqzOCXov3pY}Yq3ii$!3*){O8F!;@1<0q`y6kgD>6J5z0tMVb|ktEiS+pv$DTx z_L$uURXlkM0^}Ufm)qw)rV1$?1T3SkrJ|9J2?IM#f5ecx`l!@Jc|61*aX$6F1bkt1pa&jdgiMbM 
z;u)txKq~#rRT+v%@OU4qw|{p$O3VH_bJwwk$k!L|d(oK-p5^9doGG0ktv=nK(x*-Z^DYs~vzl_?)uSTlQfSpQMheEF+j7xPAqfpC~?PtwE&R?T6Cnbn~08 z%@9>c&ysacnd6VD4^{=@N{}`zjEd_Rp`*r|r!1#zFv)4#KydR3kQyeMxt~EBwv*Zn zNjzr(U&By}y;wRbdi8Awuh%LV9&grmar*>uCyhe~?Q0f{4x4FR4lMv2-M5<8^XDS_ z{&lRosW!q1g`T%s{;ZJOVf6D(f*js`UB>{$WP!xk*_)p|^N=9Bz6@icBS1P9pG7*S zN#%#{T>zt0uojGVz1v$roTRwG-!=V~L>>nt^Rr~$fE>?~{p@h+W6h7BpF(`v>(HsN zJMaU8x5cs0BK+l1tX|`6EujCN>mM_giJunSn?LDQfD=555?&eCfQoM&gC-is@cHJK zje!XT_~Bj`hH#(jfH#i$ILon29O?8vbI(9JzSh1v@=*8{;A($xYF_s+KHfqdqHj?S z(ynNaZy=lnAmRC@H(QBVNwJ}t4R6LCG!LcA6-DrU%i7>g;4IR4*ixl$b`m{t^nHAA zhZcN1x}jv3t~1_vXOm$^q%Bk*QzrE%vw@DsY0vq&&V#_hZ9ImX4e|HyHP?+J{IF_N zv5e^hTllK@1T-ngL!Be}2W3?95PP?4gwE3|Nb&u!n9o~QIOLT7O{J0tx81H5_!cLI z5rY!!*5ZML9r=yiHW}dY;91q|=3HR<;7P+LrgLDZTCsAcj~%3zSbvYZ+YsxvdFv)y zX<~&p^K|1wS}?UGF_zBdGzg!qE@SP<2k*3$Hh%sF=MAH~NZU0(!0kJ^@N92R_VFJaP<8K|uI|D(ScOjP)FM9%;#ovO7kGnI zN8{&<@vP1OSJ|SU{TTKBz~Rwo{4_x8>+{T`IFcepVJ=VzOF7(6ImLJuebrMn>uoxQ z35R!uWDaU!t;#!Afp_bW8=I4fVq^vu%#OU(7wm%{Nl7NjhSi~N5Og)+b0og$x3fs- zoejMFN%fP)=d&n=8x}il^nj5;xkSN-C%}uC78YmTW2og)*r5_B4R~v#UX(}9W@!I* zSdTZ67hiI7p8djG0vyIC4!Y7GMqJkJJw$eIXtJ-p=-a?T+!MXYj8{$quY2Od`%vXN zsGSnoGSPbgoX^e<|48(MNm0+4np`cQ;pZw@_6yq3;6a!S4ObipwPt<3m9ZSSi_39X z*`7l8v4;)aHaT3}$X9eD@$ay;L1S@X%dS!5XD=nmc;~X3ja2@H=^51@W)EgZp zan$;l9s`f2ZF>xW9X@ueE698k#D1T?`OaMGA=1r8?B+Wmg5&ZFZ4EW9f|MgW$NkE4 zP(Eje$FWoQ&&kp5qa(eSH2QGCH%kx)3;EKMJj|L zWzo~d9Bo=Kc6K-=YKI&4$Wh5Py`Bj!mI`Ix9hBxR*=UE=phF!h!Q#L$>sAPDv zQYU{fn$bGJblU`C+VWYG54QnSH#iWflgW)$4vQR|{SXI|(Kp(|mlJ?fy}hTr*l`r9 z7RS}7IYUgMzrW!PzYtt0lB_cEWfiVH>~@gjy9nN%oxL??)Ek%N>pf@Rco>JtTr}c! 
zjlrxj?@4qktzeF-YTf7c@;D_-T;%ejI8D@YvmzTU<^Lsuir-(xp}jG5l2Ap?bduo+-5fl2$rYNJvK?Lu$!-3hk zxJln`D5(Deo>lkS8+&66+yyqg>B>2nBiR!;FrUM#OQ$R3WJf`M8m{SVI)QU5^pnb_ zvTzv7Ac42P2k6$`K5apffRj9`nj(eHf+C&0oTpslK%dag_(E$@JhJ0jY@Qf5wg}rl ztrZ1vtz-0%;kI;iQzPk<`pH=Eq0u2lqHYy*zc_IEaJmtm_^9+G!PXFWG#YKo?Z^em zXJ+TCY>I%4fqqfo1{*9ZU1Qlw=ZRyqLf&xs$>E&%yOsDzC`x(D8fP^fje==>=T^}d zpr^6x6x%z6F=eVJk(4z}O4D8^8+qwAasIO8i`A~ec&|)VN_64@aB-e_DBxxi@L;ii z@4veUu&%yJ#V51|vaqbVTCh_KvphQbtPaZIlf4$QAz5tDq(thh@pKlr1Pp@S?#Kh; z(lRU8ln0~L2X41Gbol||8W*}VsT7cjfko2i%n(Ut#c4I+bAu$kj}*h9B2+lwlQ)gY z)lp(=SC_HWg5Kc7PNv?)aj-Eq@zO{)+U^YkW_?PXWkV)8OALZ*f7_;ZfF`bA=@bG$Hx}LUF*~DT#*IP<|@vKryFoxoo8eXoi?6|I(N_G z={h*xcSNY&nh_5?Dt)Aya2hnnaz5Bma2WBc&%{mJpGFo}+bkWlB0%_5U7L0qJuLGW z(Kmm(7G5vPuQ4p0AV$`|t>jicPKvkr_PTL(8;M6?W^RUZ7w&UdsdFMe1s{q%FLp~Z zAAe=9sctRY357kvpRP#I!H@gX54W`+1`Ml_9QQ*RtgSN8DxGHz=V{i3zmBa0+)T|; zZ;w0U3WwnPp4RqIf6iZUDyAAJ@N7W7_dW52HueoeKEbH=U`ri!js|L)d!`qw!-wnE zHNQBsLk5;62x#v0r-!E=DNBbrmLeB{Vb)0b13;3!A}gs#4c9*@tqmF0gOY~tPUM{y z!8a}PLZ8;9f@2QSAC7hT15KAF(dx!2NCz0N?<``4jR#Y$n?i)3i{;0R1|2~>8;*qC zchF!JdU5B`rhPc@gIXsmAs)ZU5g#;veFW3n*_C+dyF*fFti%f~dCaLzQnI}ij|M{L zsGGiSgg(y$MRFY+F!vGu)UvKhK&*L}ApS@f5>C!VHE(mo>2<;Q^-u;-7}>Oq$Y%@l z6(8`8g%pC*tLQEhwU42DYe)S23Rz(9qv?yH0)nt@z;Z+u3gHenAFCdz)8ONi^&`?| zCy-sq@wAsc;*k0j$L#Ja;?S+bsFadj4xio7m?v}eIM~Wi9rkWlKB{eKi`5KkzEd3ePqWMTz=(ESX>*WnN^sq3afg&2on#Q+Sbw|R|@!|@G^;cn} zc*6Wq-+iDemv4<*St1NR!nm#Gz$w`34vYEK%F(*dr=ArR#=`5-Ztw1z#GtFQonu|% z@j%|Yir*p>;1@P2`N_TF7&45l;Ms2uzm`Psa(5Pk!6eO!{-I+?spA{1lkRrR@N;FN^e2mJmbxM%jIFs{x|oTb6T}#P6Jzr2yNWaS{TZ9gMuzdd6G7ZJ_vi6FeO$ALWi#EOP>LF44_unUI@Itya zai?2slfCMY;}u6aRkKqMSUeb!LM1kKn6_{t6l zLP2VQk>n5+94TIXf-Oi0wheK-n%x?YFFt=M{efmb9^xxj^H)EJ0$*I0k~|QEt!h{0 zG31BfzUbLiQR(3TSq^(Xn-9WM6Q-S3EIwG%MZI;Kk_%`Oa>!ugi^MOsa$S^Ti^6=5 zua9axwFP#wn;N%|dtvjGO8(i=7=$F&yvb8PiZbs;S}?cMVg57f(^D^Ga0Q28jG-PM ztPM=+3ozA1GXYrb3;z+&FLA|gKv@P8^v(%aP1<1-0k(UC$CO}A=b*!}fg|W;zW?WwwvC3@g0pK9EiABTT9;}vpAD+6? 
z75HuN)Q~;464LTpJv!oL4!t-}44m_o#2wc*e%!L157u9PQ9ulJ0+uRzY9l6TXew~s zl>Pb>pyJp@hwFDNLFUUMff&^v#2m-gwCZybcw~O+XtUxJ$>}1G>79FJ#MUz3z40NS2 z(sy(Cjjz<8pFa%05N!(Ee>)4Ut6`UX-{}X%?w#GJcRm2Wt&dav_&6RZXwK{lxAub* z+S@qK9nA#Hs|~gazX?H(Wp9)v=0)+`n}8FMA$r(#1tUe^GaWeMeC)^7(Nu8Xm>Q*o z1S6*}Qrklk*TXECQ*V;#tZ;wwm}a`IE;K8Ndgk*W4z()XjLc@*0#=TNSlUk=2I>#% zTQ(01C6x2X-HVtZ%QV(;3IZ zU=yE|gUz0J<3Yt?W;q7fef!b6X2VP%xM73hhztwnAfjDG>XxwRyzz=N=W9*!&wi4-b^zskxqqiQ3*zwV`)^Kk*#aLFoED_H4`i>lU&|{)jYWn( zQ6p^%{J5yi{dy2LbXeKrLYJ9{jAmoA>JtutGx^&YYjxzXlWn&mprwN>w3Zw5(l^6* zM*ht~FbtLSzO2t?OaOY19HO=Q9np{VZ_kL3MoBiw>XJk;Dr|RK>#I2hE$nC9pqZG! z{9pQ6vaXA)Hz1FLk@;CNZ$OS`$$qv`=iPeS&RW=6vU;|J`yzVFl%;*O{(9onlY2SEST{e0^nUeM>>n6%ve3W+ut**-tn0Zy&@L^G*>8Q!6n zVD;BL4=OdR8E7W7;j`+mDRH%;ke0JL-E$-i1VvBtOgcw^Bo_;&s?dGN*nT?p(SZe? z@LD+O&7?78(!JoRrW6J?cmtQY>T-naygzg}o&hZH(g!5odcxZe4THuO@Y&=zuU+}M z1s%TJ@53UrDP1IL@J7oSUHh7kWwX5Sjlz2GScC#@=gW;anwV`kmA%IY?Y;i;Uqd2U*e_++3#sdOrPI_nbE0m1+_Y$owz<-b@MYYPC1ysb@TzUO;!%J zc4Weqr*>0qV(1|jsqA38rMeDYJmwyk zhUq%+vp$l4zW8dHa&gFIQ2NGoVf@DTbj}z**G15DZdg$+7v}ydcsvOSRGxTa;%ol+(SMT;(hNHoC zLlG1k4}mn>$9wbQLy_*_cVaE)K1AajF|fBf1}J6RK|AktpiX5qSNi5j^vV5mn3~m7 z^osF(1n0#Jq-1&8$UJfzw0o&^fhEKrB=nDdQm1+hNOd{AyllZR+(v23tGN>NWt%yz z?DuD&;*=6eSSKFx4%x*Xd(w-Z+q{W7ORs|&W7fV*&)y6lkMy%dS|_8-)|D>QNB1K8 zFCWtUt#_f(5WVLvEgrb%x@dV#g8>Y@C;#n%R3X|q)73lbbq8>pe5N#@5ZDx2R6NTLP%-v|`i7wgC zC0Ynv0byAmD@11+z@=|iKKzoDTiU9oSLfkEu!);?2($U*%Xhijn;Nn5)+xmLT#x1+spYtm`7{4annQWPX;+8<68!vY)LA zuxILo7olI<%A~BeC@5C0S=KXl9r;R)ztPIQ2=`s}e%Egw3FRYcDQ?oGAdbiJ1csOk z&`(59(vvzGLh%OQXZH$`vFFzBN-1eDl$j-PbX6Mi{BmIbjlF3L<3=Xchkdl*(5mi> z^mX#EbiUWm$B-R2Xqat3Cz=CPtd%z@U(7-otV;gfkM=@2_3B9Zc1xJa!+D|2Lkbrq z<*3?xi9`L@2Kt%%6Of*Em=Ie;Ff4qSqPRcU5te$q6BfCyiobp8_AE~hMMMdEmABn4 zKw(HskX|4Q;L}p)zndvQrC{d6E;m_mri}CkC$Dv|mZCh%NxGf%#BIigJ0S$*=k+I9 zL`5R*L(!Yxp#0q#QDa4R?jsxgw?HleO_;^)wYVu^Hc_hRB(joA zPS(HVg1WufX|AXn0XE&2nnkKIZbR!n9QSCm^AO)@lp<(2%U4s1t4Tbwy;X;Bc%bC9H085MIj-%9OMNP? zUAtf9`mJ<88PXVYvd{%Zd$ASlFd@P+L9V_pE;6w2s?v7;x2#x7i%@Zg`xMHjY7H

H!F4=dw2;S9w!-E@%QRkdbNM8vYZ&n`g!Wj;Uck6?Z?jBZKX|rHZ zcE@Fs%|HhF=m@e7Z{owL4xJl&a-?u)pJ3xzqa0AHA{~ssQzGsv3Udt+J@~e*A+X*C z;LFr<_l%hJaQC~rYqj&D(X%J^FI7V$&}n_g+h z$ts@pRrmZrXY(%0jU}TbD_%+SCPy(?>t4EkmbrnbS}8A-^_UHl=Yx^wDUkKEWL+0o zZ$KUgBlELl-hdp>lKm`w|NTNgl^d`iCC_=ESp^WAzw;rsCkn#*b{C3IUxA;KvZGbX zt^hSKS>G#^1iOo7{H;!(glEp2@@vyRiD;8jL)=_q;l7uwvYpSOkbM!|Zh7A$00rz9 zt}Ic*gq&ihhbzT#F_TM>423!TVQljD!PjEI)wRV&tRM}!RMO}_4BCbTyAD|dJpdljb^AH~96)Nx!)T z9MXHsueCmM0Es-~D|CXPNUvIS=m(Pw%yiLr=}G!b%y?=hg!4kbs`X9L+joM z$;vEH(3kUt{!%KMYPs6CRcHrrpgLuLzE=shZ~1)rX^lSQJohyqJWd!_nZAxjH5zcr zNMP%QuRh>V=bh}2iKmg{GYAfSz@=PweN{MKB#Ub zF!g$s6m~jrCn9@L3-V9Sb1m?*xM}k3?nU0rqG z4c>}xt^|eeT9Vci!*J-z@l)OfdvQ{3-rTA1a`5rmPl(Skkj3PTwO}*VIYlz_k6zX9Rd*j!6eK zzn}wBdFJI?8XE(YyQ9s-wh7{@DkVeLURzwWLV_VrRS|ZreT7m!9YJIJ64W*h_@GGJ zh8dqOQ5+Mz!OJAo7AsfzfMcqn_#EoHuz_|rGB+?&8C>8Mw!fucZzYt8-0HJ;W+YMK z;E~EJhQk6dce|RV*ZFZ`V8?smE%r<}%kN;cCu``!d}ng!`Br=FH#(+wp?iWs=w-VsQFE(i#|g{7ON)&Mx46R~DzWrlO*wPI;6%x!|RL zPya-V6{vW4?zSzUf`*L+KI31+abeqh>I05RlGnhg2&dw!#FxcQ%!df)h|;c^RTaGX z@V3cp;+Ro6%)ZaV`60XvCd99g=x)%2&J!_aM{*qSOul1>l2jDp{u<$3`@$UB)-y;I zNAJfkL$f()j5APsSNI;o6hC;|e3x@7R{-X9qOMAl4@Ng@LoNo>`@#FY%26Cm8KCfv z|4NC=!D#Qx1$@+bYkfx4}5{4xPgsjY8MXQRN3EegSV z!?@s0huH}M-*{wM81=)ZCKv=|{#flKrez_il^$4YiQyuIW)t6ARYoDQ|k!GXNxmrMska zNzmYARm`o>4x}(&SX9RJ8qD=wlsH&;3BFs~+3-;G9B|?1dVD%g8$P)!N5$A1KKTI>B3FE=xo0@eKmtSWcrqt&mCz2;e9#Cu2a;7(82#V9K9_O>}x_&+?Ihd)<;*v5&BBBN{-va^z8e4Jxb$V^7| z$O_qIWG6*NGK*w~h|Evt*p(0&AruJ>g^=~*`S!e?^ZXCz+~+>meO>SCdi$r9*Y99f zkU-2qoi-51q-exf^A1$zm?;`}$f2dz%h2gCe%zkp3BY9&`U3dTyFw^^2!6zM9-brQ>RbHXRH+P;~7fFx&Wc zhjU*pc%Lc%`C(=dlmhNG@qJ|=wN(Y5G9u~r;|<$iB)2g}&EaG9iN6rn!?M5C)Ok^_ zIlq@9C70nsGjgWoqB+={dHgVG^E%+3DiBqHK)FB6_EldSfbYv__GdQ+VL$1l_jjE9Wxk4zir+{n+@wD7 zzq&zCeIY}IcsM+2uV|nRzQzq7>;+ILWj{^a7FiRXAE7?1OhOTVH^Rda`jHL<`t0^{6 zz$0%*j+f*w?mgAbLF)#keZ78u6_i2yo9CUHp*?O_l)zwL7z4w6`fY#hR>Dczw+l&J z5s<}|u~;$w3|fh6ljk0F$JJ8D%MA=$K-Ian#*F9h0k7931EIlg*kRp#|E~2x{4|-6 zRLXy!5XH)<7o8hC=w;5eihOY=a3?f0=a;B1Jn;C>`PWDCf$2Vl#Z%vc;q{Xb=wGdT 
z#wZV1k|Q+2sL?qc(&f?^Z7ZBy{wY-s)|Bf^HC|M}(u!98N9YT9;DGWM&kT}3P0>5k zW^)vwIjrYyD6@&B9$-)GP5X{<7X0yjJT8sPd!b-P zX6N(%6G6~famawN%m>a{<67!TdB9q2yFuOX1C)$qo53>HVejc3@0-H|a584fN&Dk{ zcAU z9j{~2t50gi?%REV;T$E|dl~Au-@Ao^7==^lqu`rf8IB1sY_XyAaEuq+SdD#6!RZLI z4XtaJFFNC~sZEP2;5<6UUL(Xwod>rc*8KN+y%qFbR(h%WAr%TNFic&cP(tbXPqK(f zp2ijQMZyi!$#9oGYri|*;qXx!(A^Rz`5an@86q}QVc_s=e#74@(DYVi#w(f^V1moB zua>eKXieRky1z}X^q3Tlx}ZG=!Z?z0Vg+x3X}cJq?zf`&)$Kdh+#(OaWW#7sLTMeK zTORJG30B9Qn)r>MAFPEV(kbnSrN4m*a}MkE6-|Br7#)G`U}Hhz_RM8U;0t7(x@oK`_g4pH=ak0 zN8kXyI$6;2$xI2ouw+hl<`@?q{Z8;`9^E6DC#bXQ(_{m^4{%JGVcLM^fJWDd;td$b zKne5dPov5s^PVpreZ)?5z3}7<`GO!J`%XA3?jjrP@;_&*zGAl0lawUB0q!4p)Q6P2 z#WnoTn2_>d_?eRYAyQ5O7tG6NxTg|=J{jpeMe=o_wxRnji%tjNJHhw;{@uZR&dr*d z9Z81j2eI*gxW(F=t;PELL}wE(v#gYiRPZMH!r z%wLMHf2o-&ID8Wi;Xq~1mZ!jv(lVb{nUe5D*Xv&^C`f+X%Ssci_E%sCHO8^Hi=YLo zM^s)_OQ8+zec9D(jQF~c6R*j&TzLE9idl(l>v}!CmKQ%r(>x?IiDaH=?|Vu4>NQx*sQr3l2{)(e}APm5ynS zBj{at3Y=5l{uTwMsE(iet}TO~BxAmjZNY-7T->(rP~gOEe_OPZ+pS?!Uk??1c$@|% z&0dz!b%ev{DYkvTLv7&Gb$x*(pUZeBrLCh9DuhZuRd{vznH)aXU>kpakP@Y!sr%S% z83zB*-TQ5*oC$iiZNpq|W`X2svoo*ceW2&=&nl*DX|(K)T5EsLJhuB~t)lkn5Hhnm zTzg6V53(@AQFL#*2(xJ9oj$B|9rqhd!jGMb#@)E*6pG(n!xbXkp53PcTYgBG&p%5yN-p;2SJ&;M>_L7OfK`9CCI!mse_3$z1fq~|X| z^V|nnxSU@z=9%D(N?ETtlMMyIJG55DeNBO2LUO9tBkertk-aH?5OJ8e|DF6@qNG{9BZqON*{YvWg_!f#0biG*a%w$y#; z;n3qz`&Xmm5#SvAZvD0Fzlg2!py=BZqIh`!EmmiXMr8YzX~>~@cGU0Uqg?3$cYNkW zgQVbY5N^LiecnFN8=s}ou@9>A#Wk$_mzf@)#kb5N!tu5(W70hVwuO9H zSIiCYnduGckR^h0FZ4>!>YuA_9ZvpHNE2_+pm=WV90T`7>6E#v#sXQ-3^H=XYv6T_^4J;66^!in%OUkQ zf@rwY_i&*v9mo^FU8_67j1wFgf^R_NZize?ky9Z2vxKjU@EZ_!FhZXtbOU0aCFU%r zmiCPIb_L4fs;BC4aUQg?q1>CQap)a#9_G}wbX2;-V{_Vl1f-lBJ)M^kfpR}^6QOJM zhXJo@o1dmd0}A7-uRDumaJ~Pmy5szK@r3=Ql;;oWpxGAJ9huITg1PLNw|&keeP&p4Z|dMJW=JpkIJ;8S9)6xNYLU%sDNMdNt*hr#fn&l(yp2 zol0D|QUL$p8!ttaM}_;ut1WXNs_DM|(=!KFKY01N&gK?etC$uXYl{Qnh4+d`(PTtd zS(h)7fd%D^J6m6RcLiya`!0JlcMg-HTsvQ8OM!+o`Hwz{KZl2}{W%)8$AN!#FcR_g z*2hi$+ppPOD2Madzeu7zV}bV9)EJTcvnbt5mp4TfA$a&sm8~(!Z}2>EjDA9Y0?I`Q 
z_59Okg@r}^JV?3$(?W~y)`xMZt81O-Oq#PT6&0VbleT#F0lrUJo2Sad& zWGNc@>(}whSuyH^odYn3lI6+nJ|`TXq_h<{7XT&b z2wr(Cx47j8Gb*!x-^Z*kp@6ZENg!-P7n0eyX|O%w!C&vQ?hOtx#u=5{y4WO7p+nN! z=P#=3K$&N>Cqx!WJ^&?qjt{4(=r_(%8;e5**p5WKHy|CvE z(@s%D;M-axtXu`Tb~48rXo-4@ZWgVubWe!eLrfIrxnI!$lGO7dS9#n5;g0>48i zB5L<;!Ru0NxQB8YP^SHB-OnHfqqENZy`}jdQWIHREq{_9m$~Zb?_=->tMce|9o+vA z`OMNkStMPK@N)5`I!&jcl%0=^*~yF1l~AkHg6Tq(T}X0k{;xdhZ=-IAWM9NLwC)_b zd%^*f(8PUwPNR!j2#-47`Q(eYoogyyew6@1N7GN}hP$C(0}k^PIC|h_5ofmiY(&{=vsfLf4sVQ*ZRYuvLF`pl< z^nxvVY~Fs8CxKa$rTETY0kqpGwYDK#A1_K3!98P@(YwlKM(3-7AjSM`)t&uzP<%jS z{i078I3I9-iLC7yp6RMnNG-~WUlec18#E9>AL$$sc_Ph%%XJK|;!#$pT*8V#Zm&N| z^SEo!vNs$}nqzprYU+TOq<g zg#yU99{Na`l?_)e^VbcfGC-MXt53c?QVLgn%pa~N$>VM?L&=52r(z(U{TmGeg~J(EQKI;cP4n^oGx8y>JjGeUDu>6nZJq zwP;O^>&zl(%ktBavxZ@iO5jrDIXniKkPsc0WQ<_k*-lF>%HNo`L&FutcRLu*u`1&CXk1YACJ?{|JK<=(weY z12e}BfJL<0i_C<9(N&%+C6?Y$#HceR_qzi+@n4qAcjoixu^ZkuL(hkUjJ}DLSy%?` z_hZSg#GU{~#pd~`O1LCj4I1@QR)<310lV7Uw2l0kwWV zFS;VAjYcxLces%J;G8$+dB>AY(L>%pj(^n50Y|cEM8MKRpvQBc;}nMvR5SSTVSimI zkQ@-bydyQ`WZi6)jC+rv1ujNtiI4#d*V$vX+_(gupzepm z?gRm&rWWcCp$>reSYoVUY&-kulxRnO+gWkXXR?s%fPMVaa1?M zeY{zIw!QG-SExq~oNF7{Py_n4P_-A*IFI!{!?zy|;c~dh#~KMc*mc9O;bM*#Jpba| zZQA{3Q4@^JIxNx-SBl?>WE=B=D>GaAR{RyPm+MZ}L6aDGaXjk*eU}`%^<)2oSvyyp zdi_Rr-m@0KlX=_Sb8rBR@hTkuw*CfMKjetL5`PFE%%{BA@MR9MsOeCym^q4GJ$F&X zwBH_>%ZYnmf~t_>c7ijcd<=FhXRsTwLC{;@F^1K58+&>8Wko(BjIJaX%6c9%LI)35 zS3S!t0pYVT6{&hpq4WA~o`Ab%gsW5bFYxii;CDdtmYrJkQ5WJ?M z$XQN)3u>}9a{K)K(5fzrtiHFT{K}`|qMP3Df$;5bA^Wmp&~3wtuqEvoi1=|t`Q22* zPn|!TWlh>^XKnHo`xW9LXQX{zmbE*S@D95+ndJa2CCz2~gq`rdBj`p}xi)%x=DUUe z%STY`$|ct;WUXL$K6zJ_D-~wfim@w1DWckMsPCO*lg3>t7!E`+kl{D1b-o%Fg~MMf z&)3j1k06y-uheR53X}{;W<0s!43$F%t24ho1N%}YGhF{`2j}-EPs(qSDH*NLKmN2k z3#>L%XpG-P1Jp+^XOva~FYB~aY1B&tBOY7V`buko(0A=PPj4;!Lt?UOnP?5HF^QeN zufG7U-d!+}|5 z_YZ49l0Vnv#xGBDC0s&x=-~1SE6$p{qP=~w80G}z8+ejsL;97{jkZ2G*xHiX&i8BNI094hjVw^AkGX;FfG&tf>V?~%Ushx`dFdFi*(omc<~E+WC} zA~-Sx-+;*75_vEpr$G2;311iCHz4j{gg#5?2E;x~%-NdY9~37~m7-*pmL0>5G3b0x 
zKw|i-9!Qz&!l2dytnZnvf10S|{NW%ovv(Q>L$#m0qGC>g)iqio3< ztVQ_co9p%|K*=8*;p<9*BL`mC%X_+_F(R{}#zKZ@-W&Qh*I%nh{x`do+z&~xgo#It z)*uXqyPx_c66A+keA%QbakoM-o8ceJlu~%gT{}Ma>=0O@V#wIGd>-Wa1w^Q8r-DFy zuTn8g26fxiV~?_9#EW-Ub#p&+p>iX?O-|h;p%f^`x9KVOMhgMBBJ8EXWxu z*)_Cew1qg=s-ae#8ul<#Wt6_oS8 zDT?tmEp*}X>I5hC5tLWmhh+m3Gz$o{zx*hP ze}K}VN2c5$LqVO~*i0P!=EO8&sS*Zwe5K<$+NANruGUWG6Gu=f>RCUGoE;aYqm1tn z+`)c1?H`$CPX{CVzPTp@qM+(Z=R{tm73AbR!TQU@9LG;w+1y+gMo;Dqh757}+~43EZRj&^wC+BX!fIx_-r<%AN4l58n_f51*R zPVb9b=lOEKUq+kX zNu@sfZh%rhvdCIoDu8K!U007VTtQnD6uw_(P=GEKPl8p8t^hc6_w`7O7@8-we?OP< z3EbeLp+R_}AU?!;bmw7sE|}8zn)=t+5wbJ+>$RJac#xI>QF5v%KCx1t5OYTz*LXdc zu1|dnuA4D(u*&#D&y^ra?Z@Gu&B*%Ec;pfd=RdT~>9e3d zVu#5B@?3Dr9Mj+DPWj?e6JPwq?Jwggk2LDr_GDq0ar`oeLbXVHG901EuJN9Y z0xOcR*wJ_tvBP*3Tv$GR z?{e@@%%^EeB`RDDG>fy z!q-Lk4Tw7!q0bV!0kO{#bM|r5><{Oh7<8tfX`jIJF5p~cQM8aqauV;dY%9L-fam(B zuEwN01BpT z7M(Jm(BLeLs*Fmj2Bn>JO}Ofj7FIM^lfxfi;!~4d-UL&~X09R`U7JSoyi%D}HH*aK;3i zsUKf>Q8Ib?+9>lVU{)3#hgXNf_>Sheu#0YBqO$qvWQ8PZr|4#X;k^)=fB6yj7YZ`; zZq>xV?Fb&U(g6giKXiZ!Va>okEDoN1nA`Y8+8-Jzb=4V*UIBl}693cI*g}dqyE!xz zmXL0(54GWgqlj5H7pD3sAJMUva#%>2M>6&$<-zc6eCXM~oI=O`KJ~PVPk>+%CN)d)b%hDN zVEN}aSUa>@l!%l79mXxDLXtkKuxBUpPd5mc<8X8A$p@-cq4)mHS+xs~!gKG)Qy+Q8 zL!Xb)qxHlkqzJ7DHt&QA{Tqukz?lMIgVYhNIz zx61*%qj64v>;g4fEGYw`=XjbfD?f0z z^wq$i4=F!RhthhbOOxib&IgOP?07hB`x3{uFG%K|K%PO3q2Wi>rjw^`c(8(n4+!g##~3eOz^ck=kiPc=7EZLZ$EC(nWG>3IMh$v z^#FF3K^Bzk+OYoPi5S1nD_B|MYtdIE-4;2SN}ozAfXW=FG52D+3{72qGR&hd0wG^r zk;RKVXitn+WlXmMJ~wO;EeCngrvW9IlhjwC{d4{Fs##-beI-3An%oQM>yeK=O5cw+ zQ)p>ETt9$Ec67C>++s$JGEV4}>#SoX`(|vH@7bb!Z1SecGk&O}`R;_K&kfZ7jM-dR zf+;Q&nPPb}g&FP1r{$>PN=)+{j1FQ zN1ZqR4%!D%^_=>9>4xDDm&mT2fUZF5+Qr7urFpC;^5c`KPHsG$^4{s!f9O$PTek4e z!KM1JY$lPcu`S zpvosEj>)?TqSm7f-!f$!L1P-3=n1QA_-1>S66Z|;>gO(h*HSP=UrAoq%-eSY4HmuW zhfn4cWHsH?cA0ThrL5#JzmMXG-VJe$c1&7i`kka;v zmLE}j$V~b5;eCu}fDg9`?UQq-;KclvSn+jd;BX(5@~EAKUs4h_u1AD{q2Y7>R5V6_ zTK-9xX+u10*E7@@cgO}54*MNgiyy;l*wwU~Z%e@gZkg2#wtDE9>)|la^asY-mOQvB zR|aE6CH^<3AneVKEv`iaj}a67a1LK+9Z0|bpWoo__^xDx2)9Kh24Uz>W 
zFYoSg4bLEFmpdD~PAy_3*1KFbbnh__A8*c~Gskg{yracMd&bbRf76?_(i^__D!VEF z*9p`F^Mv;`cf(&5m}(=PXaR|ApzE79uL17Ktb|B^DT7%Dla z$97tQe#5E2lF(Ja^{RlnVR{(cFtUuY8VmsceN+4r^hb*1XN)pg^o<4It{?Ao z-z)Swv__G8Tk>GU^92tUdK;&+TJQ$HvApEn0uZ!>- z5O**_pCxnyVxJ}E>_=f>_dI2*))j@RbdcB?wjNSRkcHu4;*OHr9{Fj*Q^Ju^@@;4Fo9>B;>* ze_9Q!zq{gL<8BN19&PIxs9Zwx4AffIbu`heLlsvp-#UVRyTytJp0I;I(duvCqb|Tp z(XXg%CxgJ&OU4gV8(ipvOO=W*kN?3A)h!*Z|MUyN?o{7qPoPAN@=B6Ct_F~}Y~Hba zzpui#N@fABZ;hZf7sJiIE(B%%>{7L_i5fMMnFzIeeGH{pGIn{z{smEAJIw2MEf79u zdAhY(bPf7_v~%9jae~2iHwIW-LM9&0&F-1==(v=Ow z%AyPA%ufU_@!;!C`V+nj7vXKegXxDnufp~xUU7q{HkjFAF6eo~ihG3W`E{KmLsd>= zx4LD>@Sic>$~>aeSYA@^<%f47fCSYQj>C-ZaMAY*hpvb>Xns6*ZT>yLO?VwoDIGkH zF8utKk)pzf-*fFCi+)9cX7v2#;UdjVt+WpAd#eFJHo?hfS}K~vO*eejy=xB*EI*Ch z$UjUX#y`B)*S3b4^`AC(0gtgM%edANl3%Uf-}DUs<9004Os;%3Mja(1^StZVXOAWv z7kC|jd!i!h8-}WDrugMTBV*S|W;8zeHrm+h3nX=d3R}5^(eSOaTd&`bA=X~MM$6Wt zV4isUNjll{Xt>?GV0E!QY{4?9q}`SeSXi}&+qIFnK9`v8KYM2e8oKkT$Q^|N} zxHzeAA)_sI;1xTr%`sp#Tl*Q4wNv)7x*7_8bJS|%yvL!GrI#+sCx%Aam)gj%O5n@S z?|s`@;>EqaMF##8@`56-T%&Jgm_Vbqbu|VNSAassyrZhvXDpgET(RU9BkoOo$Xx0A zI+F6e#YH4<1?hUSU+AogF`iVxUQg>{hJWThqZ;omfr{i=b^hmUiR-Xtno2Ya;`Kal z4Gtf31*bozym18F_@K@erwSf9eEcu#r!b+5u*!Bpr-F3u(PpR0dc!4(@19-g_P%}> zh?$Tbtzp*1Q^d1_Ye{p$F)9 zeixF#YXCSce;%x*`+(V1S$~(`e*oQROZ#0i*^d0mRu519x{49KHAJ5R!TBcmvji8B z;B^rk8G>&>82vP^f;1*XvHC`>ZT!lz{xc2y>(K-@B3 z7QcZbSlEWI(pD*EVk4-eZvYVrq5ij6A|Irx4}E5OlLm$>M;fIr@xsqy8DX|>-YM>k z<|yq-^&=B^L=4nuJ28cZTuBYiDddbU-{EsCf+P-SUBn(f7Ss0b+3w@x4FE3>BKW9s=qa#mjbA zJIq`EDhgf*Oe?(ku~}cGX|2zQ1w5NeJScS`37kFl5NeCrgJn*W*BnvzL0xZePhD~( zc=tCvq#)}F>^~q&U$s&VdwSF;gD)09?wR?>^Z|Du!8rewl)J^Nst+casVKk)fq9A5 z#(v=bI?G3re-_WXzA1f$QUILH@23gBbQPQrbax@$!Em~3kzDsuG~lR}?xFm?>tK(Z zg@Lrs;`hDDhADnpLzl7xOZ$`Z5UO`4k|OpRFx8(Tqlvtv_xp??8AUKVxDw4ec&Yjr zT)gJX^Mz0ehTh*a#(ZCJfutN&O$T>+aFfHTy{ak(`z`umUt)9>f)9(|W#AY^ zo=L@1oGDfVL8ceVmtQMG=L?aIXK`h?@OY$KAyENLD0?w07dRpbDQm)+q2bthOV%ML zS#PX8bytfjx*jV73MFqF4#929frBxhcAH<2*@rrQqXfv|KSuV&GtFJU-t^BwYWQ5h 
zx#CSTDgSajCH+mTJ*W*oYnJC738%jOV$1E6h6lu-{8kUug1I_Zw+!JY#B@~uZ2==A zSYmLtt?U;B>G3)Z;(ptR#=cM1&4TLCg6s6tNK*falt$)wD;Yg1A6&EjFyJb@NhP`w zBhroKya`U5`}hJm;zMg;_f-&@j$5?uaIQ38cq0%XqrnT5{i+X@I{n0Cr2O+|V*8N` zD$m-NRuo~Sm~o$8pbmIIT7#4{*g*c-|326Swjpu6e7`u98?Y+;jb-rD7%YsvwIRLp z0K9#W?1Sh}7RWBa_cywP8`kM(FB>%@;6A|M&)|cwuWY zB6UatdhegG3$Gh}&?olZ`~3&H(NLeqkiP0otdW9d*7p1a7Br z{ZX%t2BLGU|8xUXS9hHC(0>YmJ$U4Q=WOSh3tPWeE3k-T??!|hM8R2KCEIt3hXCDB z$eR+)v+zYw<>ju{F)Vo2R{CMoD8`d|jKBPrEf_o^+3%8 z#^~nXyc_=+gS?CX@tI!+f79re25C$otAD-(#h3)_e8`%3ZM^~!&tqh`qm+g1TxH0c z)O?Pl@lbX}6|uvq&$i4Vm2_a^wL@cf!ITE(oxuPQ8L z!tHMAw-1O(F5M`gIS0ulhQrMxP#__18PGS)3hgKxa_0>8!^hM@sX^_L7$su&i1xP) zG8nwgA@#%)d-?6?4RWzisyOT~1cRlSX z_8MoK9XpST`*He$4jB{sbEW>^@Nucq!wtqD_u8Da;nF2|)HF)`d;kjhmiky6QZOVo z&V={a=Tjgj#w}U%8xLT=q2!To@e%P8CjVj_WC|9Z)mklC8v*`npuqUm0-~$Cs;Al} z0-pQ{Iihx|3`5keU1<(WNBF3ow5aIpHV4;Vv!Jt*gni8lTJ=9zKzZ#Yp`)4In1`m3 zx6S=VL|)j5p5l@+xE$x{`tIWi$ST&-lD41(!wj=+x1)2BP3N5yS^7N8EUc|D&CDKQ z9e=YV9!CkZWz0Q`8AV}3%uCrB>jgyylg0i|e9MZ}0VyF9-ahKsarqGltanRm^m;b9>rmnJlTKVEDW zRos}`DH8@QmWu(`e~^J6^`8PSPmLp@d{N=&+uAV3)BM}tNOPxwB|ub%Sq`+O7?SPn zXuy5{6_uOpRUu{F8zmn{OEG%(uZ>~H>@czf9@mcbZpCnm{HVg83^3_UYC^pMBV6|^ zKYrw~@c;KPj{aXi7|~lp^eGUWZ-PHda1jY!7r~Js_y$DomdJwX_g~9zD_ZI3a0&qxF zDl4Jl0P0z`q|_6B(`^{X}k@&fe9V0f0&P*Io%v>ahc4Uv-q;%qd6Mt1G2o@tb<#J2ALi=b(mrBB zYKVM0I<5C0A5$q7$?P7zgp8D0-QAJPMXd!!;zo0nYO) z$l4)o2($XBYuT*rNaB+#x{`DQ-1=FfptGAKc%LyDS@It%m=z12+dDOYJRP(#l~C?O z%=GB*J9ai>`a6wLEM;e)s3Dv7pJ5i5dNtSGL4piDV@sDJLiY<-94svLVdL$!8Q{Gc$9TO(JJkYa{3k*2N7Go8p}gnw`DhYG}5gwcT|=O!}mZp|;T5rVuw z`6 z$B&6lB36}0-6p@ChA{@k!7D8Sfaiton#|?^GEriB+uFGg5hgo{O;LQt#3T(y>UIx+ zmc7%f%WRxLL}`&c>(n&DGVRjQVA+byv(uFa-uRBC4t=OB*<5OF`rvywky{YLz_VK^ z+$_KwYos%_5xM#>5UGSpK)dCRFo1#<^mqv!w6`i#;5#vazZR znU5sjJ>~lg>kq|VElI3k3<6_@!GBd?Jcs1Tal=jQh0~iZJX!`0r-*wQwQ_*vero!> z0SuA+F|N<0T8Fi)MVvbS?FDASr+hx!Sr$GyvFZ_DBnYZrcc0HaG=(I0-*(fJ>c{x5 zth@aB{uz5Nlt_)W3qby|)|7)+_`tg2gwJ=ugTSZGp=-4~0P}d!&g3rj63PE?76~-* z#ke#>^sgB|!-&3HqKA>_2P1lGh&}~^^G)z)2`(bR>moQZ1mA$j-4b~)BBwz3X9-^y 
z;Wr@eV1zzP=mx|-OUzl<2d0lkZ}%WxBn3Mp+bZAp zZX-LL$E4VDe<3E;0sI^-Kd=i^^!;qD+(5eikV;6hG+betrIJXYKyQ%W&xpG46G^)- zoH}x3785q<+RBQUYz`B0zB*gR1?0V5ns3gFgX}9#rp~LiNVKqQs#mHM;!8X5L?jL( z2Flmnp-CfVE8}ua;J+7&vDEG~YBsF!HUCRW20?bHURm<>`;TelhEaiy5?viaF{Zlk ziRKN${7NyY%8=l-wnHf#Cr-%3&sr;4{`q1DX=R4zKru9Gm?4y(Uhc`#0P`>wKVZre+(1>#9H;E&I zvkNh-k@yDasS@56=hA3MigsNf<-u^N1WE_tUJ*2?Lu2yR^C;LK^-F`~>%ymUE$);2 z2I!sc&2Z|K0H`LfV@}d%@zOYl-7n-qAoT20@BTRku$*_s``?^pM^#=Zg?1p%IDMXS zZ1-Z`O>1^%ul#DB?q7X(isB&rsW@=I+;$yFr8RbW-b(5(tc*2a;Ge@{sXf|Vx{rf| z>>x9@PKK!*#eHZDrvQ9dC>OJz8XV~z*?=*&bSY5ZiO#!>fa^uVzO6|`Ej5Yiks_iI1cY7f} z`BP?Stkg}D+dqx1A>Zbjma37TQ5=Tr!mU`$jaj>!o{dQ3pM4D#(}!WI@13>BKH`8n zn!WLn%?S`Rohp~XP}@8~>Nl_~u1B!r-i0h40f;(7P7=k41U5|5w=Xv^61fSQF2<}x zU{9{o1?o8P;>^~W{BDj?cxJ|nSNq*X@Y*Qwll;F;?C;Ff{7~-#(lh&wS)@x01n4y$ z8D(q6`WZV5u7~VHAEq8(kIfMSZk>7w}1QXr8SB$ z-gXrHQ7HgB)oEKT0T*CWId`GOt{utAzas6V(}XZp&ObU`)Q>G?i}c)GrGTbeX#2M6 zA&|O^%bnpLLAZ~|+_c)cg>b!molJ3S5=)=o5qGm$QZ)a|sB~SG8{+?s%g*Z3fR!Av zfE>m=q}Ew8;Gj(n=43K+^x3am>{RQehB40?M8+UsEb}J^PJVH#HNo^GUQl=mXk8J- z4Y>Eeq)y+!?4A^E4^e%@gz(2VeUcO6QVcg z-Zr*fi*S9Xtq42Ug=LW~s+Tv)Lf(%FQYYK_KuYG|x&Ft!NMmn}SV(#W671*pr1JV0 zCgip=z1PD7{Y2j0xUkCw41Ih`wysbB_h|v&*xiR1v<~?qxmkkzeg5!7vt0q=^tPFU zVdW`C^okRGw?q#k(GN!S))0LP1m~OJ&k|fjg4acGWC*?ik-H`GU_?%V@Xr#yF2Zj> z+`$Name37|eU_NBlJ6*~zuJ69xNWYru#xWfWSaC?&e#)0=4NFDm62VH??&X#ddnO( zs$i4Zah@8*?gTP;a~}q(lxz!hqNjkZMqB?1-4>>7FeoC_`wQ_nGddR~Lx$cz`683~ z95Y;{u&Vp=fEzyAXH3yVdS6o|N?w=gkhn&UPJuoONyzPq;%ed4UaZ44CcsGP12R&P zw(j^q0%Yi9*XivYg;JUIY@hkLz~}1t5tdIWSm}_X;9$ozB3n1&-)H?6i>5X;eNQ%t zYz(LICfM);TLwzrvS;$Jg*x7&bXW{N)~RKi6B7f^ncDBv*_UELC%r{lB%UKDUS~|r z`X(R;e7!2)8K+@g-Lnhnu7$|2{|?2Ek$S}+!Dl@G`fdwNxn~4OJ&fM<&vpLwgE`Wv zT>IBsW6#FP$U*8;5SdJ@=OS^woynJ(b+x(iu*C`bMZRc|N2&aR#Op%id~vOUK05{xXm;ddvs(f6J|IJx7#ve-O=~|Gx4wacT z)vU0UYC&A0oEvBvy*>Bx!dGneo2Q&7x{8$yGkB)Yen5_!XizFJoPriqr^d7`mzy&z zv}l`Re`A*w?b6=7`GYYgRB9h#$U>C&p#!7im5M>KlEzSNAGoQK<@q;G3X&?r15%9d zAzge2Zk_b5K}`CF_5G>}5Z}YHcI6aLu?8l8p+5h0#TG&Kg`uIZ%`e_|>g7M<0X2rl 
zEjTZ|N4$FkndMmLk@r_9E)@N)#li%Ay_27nA=7~c(eKsR0GDn}@7l~M7$$FU>u|g< zs6rBfed2$L4dsVMXt$CvCl8+WI;<62cd2?op`V12_Q?AU6fW3^510EQjXh%fw@vJm z+F7*zSDX`Xy%y?rihlF`B@NW+J~|A9*O7rwo)x_$Ub-%i0_Tx8V(>pz?nBkB-;h=v zroWe8u;33pnPZPM%fh}z-yW@MD%9ovgv0ruL%3#YS2|<73e?_svv2Pm1uCjWUX?m4 z4G!J+G?;1Ug3CiXPP@vlF^=cYaQy2uHe=8m>GWa-vAZN0`1iFi`1L2|dxr%J46l=W z;jEjF1!nx`G*I;&Yg$w?EHnR#n6dZ&T#q;g`>wuS3jBTmOr=&+m)SCd{Y#4*3^yEjvVaVuXApTdLT)ckcirO$l$;`(iEyO3>zgGzMbk4Z{A4AYYfBpD z>8d-C71E4c=+wETBF_)#ifW!dFH|G@76k5w1q~s!hYhQC z!qybmmuQ?mkJ5mHH)uoM4fnx~`$+Rq5RCC@JaLhtEyJwmLk71j(vgO^>Z;i56^Q7~ zi?@&DMFOH%oannHdKig*Frv4H=u;p#-vob_;35*dE`lRN@C}IEEs+N!atefhmhg2E zegonTM(DGIZb0m_#GJKR>ijZde-N!J$lGM5T0sf|cDTQORRykX>y`e+7hulYqWklI zZ9z#O%{u{)LujgoQ1poH8lpZEeSlre10t7#MItX_lFE++e414yRzF>837}!4-t+eeMv_Bi9qjlCh$XOp`+N%82FYmX_YnA zAPm}C;e%A9=QxCq;<;xkV$J5ylBwB=)R&d218@KgwNYGuokVChPR9fdv2$M#MY2}UH`+=dH8en zzJFXwB-xwnjO@K`=U9aYg)rdteqK`SiTw~OB+aBsq;xPB!gk#OnLu@6^R{_oJLTEeA zCD4()bv=Kx9gb+|fG$-D?(*XXuw_pWrMj&%v|mJGAF`PPL)FmEPITB!U-ncH6|@x(*5*MjhZ$Y+HwD7 z!C@&~FftGp`umk9D5R%)sUg6PvL*^_mu*=AH;E3pDeDU$q;ulFUzH?!k&ao<^H~%e zI;d$I^KljOcG8BO55LrT1HBJA~W3*!A67 zUqlL;x_k=e6k)^fH#JAOT|wZ_{K7b6CHOGwY{#3KZwM#d8q@mt09K@9RZmM(h&8^Q zjCdCz3EPl5?ohYmko>0C@wEr?aA$RW==cadtQ3s0^c(aCTXH2mi;rRe$I-cZfR_RB z3ywGPsA54wido0KJu=_}(_W!?uRbh@H)wgKO&CZFTuk{K}jE39)-_qWB1e8jJ5kG-xQBMiD` zJURJpQ(d4UHNb%NIQStq(kR=>$I?Ox|$$I9_N7#UATt zED9LIo!Bi(hk`4>x58o+4Ke?pSDf(O5*|jv4@P)v2%iGM`6l?Y1Q(IubrBpHf^R_d zZizk^(NiGuvqY|o$QuxMFk(MT>Ep$vV>=A zwt{ZmKMv=tIcEiq|3+S0%qAu7VM4q9$UXKj6^2dg!rBhThR|5j4Xd6OgXh)#vNO1w zk(Q7IUn90^O!W7U+qOoZ5yL-Sz-)mF>I0ssKjyYexffO1qs#n#pCNF0P`_Z`Ed>G|tkrQ#8p5}NQjb?R6rhsQ z>?)`7eZ)p5g%s6IMy4zqb!DgTB2Jr6k53H@Atp0ZiWtXV%04IpaHjf#crhu76SiN<_{aH1zkQtTpUh4Kru8YOxKdJKQcd$e zUKfcgMX!CsJryifnG3ky%Yj^v|2mO+x2X7WO1A{W7YFurwZtjvLHO>>6UFhm{0U8hA zohzW!fJ$UZk+d;Z0CNhRbK?GrbfOC7R(j#;!~MwK0q_o*{?#+CJ3aKxAUxmrfrX{V<(E^Sbwj8tJp+fbo~83T&%}CRa2m6?v&!s5&BIB=Rex7LDEa? 
z9Fk!uyXro=E2@>%o^<%tUc9;Jbn4L zf~yRL`f5#&CwWDnY4_Ij164(6?UH-{Rml>@!#yCJa`QXlQu3QyXGR~?F=_kpw;6%_ zk9Gog;m2KExDxQj&)jJ5QsTA!qT{R$l<?c#P(1 zb?J7Gr6?axc*P0dE#YCD{(nCh;jJNj3Iyky;Lj3VM1t2vaAXL+0nxi9`d~y)fymDi zxh^7aK-|HI{VcIJAl|dYoNc|~@pR|0Jo;t6VFwjqN1a}djHG6>;n#f*giF6Z0ZshZ zHN0Mm0d9#C-tm=cXjQ)so8tv0^aCe7*$@pQPIq>{oeAM@3te__|#bS{4T?hgM>3)^wdm;&MN zH8)OtFaQ)?I+H=E2y}Wc7+J2ghP>YQX(f2S^ZO^-n(8Mr8JL+mVE( zTCCd9_e^f{as2!_?232UkgG!6@fTD9N*GZgjoQm%t4~ajEzW$)|zRbuPcr=IZNZ9z`i!<|$ z>^Xeslq~<^4d4nCh8~HLdbhYcX;n<>gW*EQ?OV13Env{wrG8R=7C&itO~(6_Jy0?8 z_-x`X2B^s`Z2#TC&IfA-{@c%Td5sT4(%wL1>Pb|KgbVD-F0}lNtTyaqp5HqTrBFw4 z3yb6tCRDSfrvAVJhSBp~UH!nQ2lwSXsBP_&0Y5t^eHt8ci?EJ5z}KS;Q{) zO@}bDbz$wj={oLDtQ;B84x|T;dm)vw*lRglhTc^B42!-#<`Av zL>7&ts7$QN5aT5$ik{Yd3d`0Md%cfM2y{Etcz zWqX+d-m3WgoB%5e$~U-+o?NyEjl2WMQM*{g;`OJv$%k@i({yps`50|@%F21U-Cz;v zdU&C3ai1hQh#ffqi*&&6`k8h|Ef&<5;+_|qr!~AHWvoYTX$Vg46%X)cSVdUBDQ4<~ zjU%r7OKOQ(e=%pBAL$tnjKR)^@xfGId61zN9C&|}3T^bO)KA8fF|ivRTk(R7c)~sJ z(+-MKP$x%PJwwkJs>OMdQ7@^23v*%bc#}RMecnp%`_4^a4qF#D^~3uy^Hz2*jqq`# zem&(?OOpid*yc0Z$)SUndC1e6)f~rP(|qoI5J zNX{r_2?kD$6 zsl~`WPV`e;oI<{8h7t5^_V?UOn9+kI7*(ICJ}NZz zOfBV}5=wZ*3EwT@VI=%ugtvz9DG;1*f8_vZ{|?`|x!2u8}G#@Hc-~$lMo*>bzql^%QWac2Pa&O%;&g`q_^)0|njzN)rF_ z2JW{yh5p^akn`8K&yn`Ck3PoU8%{HUe|;`Zg(g{oG{+`|e{=RE$Mzyqy;PN2;8Ve;?DU=76}Yu-qQhu`P} z)(2os>)E3EqOy#SvCeV0=6>ZJ=C_V*ciC`NKb}D*zuel@m;R2`)|$9B^ZY=(g`1o6 z-ykq0s^wwdOH)u3-Ja7FWC(O)<@r`zL_lm7qGlV?jns%EzeGGLFz47ru9F&fuu~+$ zN^SgOq;a}vBq5;;OS{xH(^930OIf`?nLKNPv+O&a!PlXSGiVm+pBZPygPQbG`24BS zm{x@dvItpNeEqXSYTyvYER$=Ec=Dq?K7T#!)I8zupKtSm7uu0gR#D5whiqucg~;r? 
zjX^ND!|l;#5g97*`~?Y3aTE$~kM0J|WyivPvD_7qQuA z&FPwjr$F$v;-;|>IruT``-Z5{I&zo2t2x8s14eryNI?AaCgvBSc})7CEUXQ9^g?`9 z4(dv)M>x%p_No$9E>C)^kTxdmHz9XEVWAH;{8HVDF*A8LfzZW1q}15C+9Xg5KkL&t z6Ub?7E%zY7psu)S119^zTN~sgALIpT{I!AB=ra(S=mD%Nc^W4*0_S%_8ve1ahXW zY}E9|B$9o#yO>pY14HI`e)n)`LZi)wyN3p}fQ!J{NEl?*L= za3kxf8Xx+cGkh&MK^{_332^?U(S^&iamI%J<}h1WD>z;48^Xsh>Ob9n8ENp{jfzT| z$1)1ee&hSLikP-)dB2;p0u%3^TvM>qf?C?y+jGze3^aWEnq9MqMCrcK`|Gxe2)_+s zCj0vn^C{QodS|hVeC}K5bib+xd`{wL#JsE^U(46ocv&;(;}&lCA=Lnq&cHb7Xw(Piv8Uw_X9CI%aMw<3&4B*W+aqVV+_m zl=<@)OSSjZ%u-Xrz2*FE)9xffe{BJsXgW%KU%VtFaS`#23!0Q>cavZ_c~Tq~B{Tlj zbBl(=H^7D2SDEe|y#XWNoPTdv(2Irj1}~gGD+7mb&BjZfH-Ma`fm9zzxvu+8F(b63 zyg?MJD_I8V4(29N?GN$VzBPw29Jq{gJf_2Q)*fWnO!Xn!_ATntHVhD$vuhfk(*&t| zL_I!zvV>VQt80CFO<36Uvh6@lH_|(H`+%b7OT?&FLWFIw1r{PN@uq_+S z7fx#lSC1VAl|vdJ)Xm+HXO9?=G<=hLEus-wsi{g58y?2ep0!JyD|(F4IKAr)J9Q1a z@+4#3%qSMQG1J*I|Jnu>Z;L8@3p`NdX!r%U6jxNar+#~X05ht8q9mmK4;QX0RD3X( zmk;JT7fHXfp+?`HxZ)9Mp@7qV*)ze~Y5_{Ayb~vW3ZXI)iTodnj^e6wGKu`F-XJ{6 zv6tp9KT4H*e~PBZ5X5^r=`R|{!<671mL|_>Y$vU$Iph$@cR*i<s98QzP(eAj z`63OiG|?FGT{@O-Cj8ocDMWGL6XIMkDaCL`2ejAyI47D%$^(rlvXP&qM*V|MIV<-n zz({s%rHQ&0n|x0xt@V-;%}kOmi!ii?WTt?UPWBGs`2F|ecpGqgD%Mg|&o>v4WVM4*Va?&+M68O-a#S(&Q)?~qpx z&JTYTO(Erp$xB(bhe6{N+r9m-mK!49aP#)+(L-0&yd{CPVr)_MNl{zhD-8KSVWe(b zgEaCnkU4R*BckIJLXYfDkT!e#Rt;eqP%tlCSx&+coGOm`?V)Z6`3~8ET6PPlxvYXs zCNToSD^B=s2@fOT2P3>SginFsd=vaxf{RG-x(JR8!8ag!w?rR|=qV8SSt8d(HF#t~hs*X!2z-ecP;vEKw$#t=74maD@*B%toADIPN&Aviw3MEWibAnTIyn{jMg z{W|rJlj6b4$lA>rWwX&K43&`Eci3(ZD!g?n;PzKBpk~b#PyN^ohXOxbc$;B7YRd%5 z^xh4!H46Z)+j5zIc;k|lWui8+`xzV3Ri!|LZAykSi5ejbH<(ls=0aGjBIsdPnV1oGuI3 zAG`l|2P+08KWiP23=w+&%6NNu(1 z6#}|erB#>8*BU!P zZ7@B}4+xgGh_-u0@t2K6Wd$B zf8(L!5w2sfC)&?km!cnIe$hAnKFb9BcDMCuSXKqevk`R8?lBx{8Y*dyDWqv1oAaK2zn!pKS$8u5@$yrHykzhSUwaXeWDbcA^R*r0$_3!0g^Dm&~2uy#t7=s-`!3PFo%qdkw2+8 zVqulN);{3}j_`fjO$9Q^M2M)WBq&N;g;@=B@vb)>fWZ$bpLke{LDHk`-5Q(x|9Qm; z-!0)`B>Z56w}$X35S(vQtBG*Oa4Tw7!v7aUO z2E==on6un$%~^wkoT!np>U6yJS>WaBe=eQN2=04LCEa<#0+_GWtDR44Lq4uvXk(gw 
zi7`LjE9OHxiYV=QB3Umb3DWK~d=;sPqddW&Spf_>R0t zr+ra?2?rj1Z+wUv{qxxO;yp=kV3D)%@ei3E_@favS#aMPMvm?E71_TJIt0G3>oCeg zE_wOMibUT<(#OjBKCnDQWRqiACz8J-aa2DJMr2FDed;@8WJWUJ%%ax0k}O`}knqq! z;gAN9B0o%_ep(Rj#{R8wJ$MtQZ}%tLe+__*8s6cP{mGCq$CN6gEe`5w(jWLx_ZV#L z%m~mVkznj*ZPC@)WuQ{4dn7jK3^YZL|MT6V^@(<2=TMTzTJa;l}*23My9`~0$l3pwt#$afn+DSGI?IXlvPd=tCKfFGbL{4!h3pk2 z@z=`E0GqA9mYZiL5O8^IAtt^Xd)tFkwtk#MT2&t^8B5c^>lBC2@b+>*{kfdo&<9FD zm@TDIf=Lb1XVcY%7plrB5z=NK~WiwGoT{a?A_kN3$W28;(4d^C+yXb6qeZY2H`3m4!d+;50>zL z@NY8Ig?X~SjVDFzm2| zs*x_mGcQ|_+Mopaw8i&W1U-)x>z#*4b4&Fr+B|J=z2=JS?Jwpq{!E<1)`Se?EZ%>F zxq}ZZQFq79C`N$Z88(gRlUIS4b3GnXk`2?FI;ritfsVtl+%6i?{_ZS?YCT*K@ChP$ ztgnwxo#R_VehTtmIKVQD4MkWUxs!Pu4u~*5y8c8LI1H9FvKU!{@JuEjjQSN|3$6KCJb&#vKq-!>OW9!>sFE1d4y6_)nlmtKQnRXo z#ssav)d?xIt(@;8R5rtd^6WW`WQ}kcLxcPvaU0x4>Jd}+wjNG+#R=an;bA2FV1&1Z z@F@_SZ-PHda1jY!7r~Js_y$Dpmgs{KJq02^OXRwUya90sBlfez-hg<|5_7guP=bH` zoDRCbPBTNw`v{u6`uDT86eI5YUUD`x&;~O7_-p7EYXZ7Hab7TVcLK&N^{X2`WT@;1 zCj8mj0JavX7M>!#j5NNfo+>*zhOtz6us_nYg=g4!scfFx+U2qtDZp!}nSbZ1CFBTc@vwn&2oKOmP zC@n^!Xv-OXGu9%SMWHS&UJ*#-{x@Q4>K6F&Uh0pjZ=La8c^Pp}R$Ki05&mjD6BL(T zZJfL9$ccgsxoe{SX7FO8>5I>#-@|7I%9~DaD4>8+yi)N@JakNoras-qfigvOqoSMA z=;7c7wJVdEu=DIS;i}=KfpdIYzfc6FPJ#Ju{zwX`KiGBDL z+U=B^3nKVvXi>91i~!ze4Q_B{^W%C4r;=>rj^OwDZoi}HB1zMjROi;_C2@b*TW?SD zDnQ*2o~h=h*q-GUAb!T{#XrCx1UiXka`TOaB}3czI+1haWWkV8sSginFsd=vaxf{RG-x(JR8!8ag!w?rR|=qV8SSt8d(|uGhbIWCnTc(A99L zxE?{zADM4@{0ceh_p6zG^dWrZqHyZD9pq(H-*0?2{bm zAGOBJYCa+Hk7h~=W_l6UNUi$IhhHFx?bTWINF7oVuxH;3w|f{LPszsxRx$ie?6}jP zh#kzZsKe|05gB|%Zl!9njT09p(~7Fv0O&qm#`JbKOZ1VZ$G-F;eKd=TzW#HL9ok!% zc__~GIi%0KKNHYpg_Av+$}qWQi^izZjPH$q3cY%$$Vh%Ld~&?@6{tOd-hD6B+!y^6 zrmp)ZkvQLYIXacy9VU#bh&(8G%Tow1yq%s!ZW)X zQtx(%#p;J5sSlRuaqe9^si#oHHvNOIngv!A^|IIbo`OkXU!f!^Z&0?PlG{wWgT;h% zTwo{dXV+a_uCdLt0Pd;&(DLlFaDtJV^WU81%Q}>-q_qbx?@4evR7#H@mAUdFv`r1( z`TX_WJ|i1gI331WBgO|8kE9<>nZYq2B5T4J^9dQ~_TA8iU$M+%zXD34zap>aX>Z0j zSOC|#Z0_ot8X(7mtG1xd0Nh>G_FKLE7n86KN^5;jfnK$Ejoz_Y#>4>gGs#Q8kmeJ6 
z(?QrVxP7KHmq*6{e8|gBEp2rKWR?03d)Vb+9_~k}ab*CxUX@?jS3QW?Fc&Pcg_L6v zVLHK76;F|f?xr+lP;JRuM9r7sGuM&B&x|(~w{31%TG4oX+9qK|9|Ip?6)rVLMgF zyS7K-!S`*7>=g~7r~KzIKDPNejgXJn z^i0245$SVB?P#Wq*SaAn=x`kEWhO^|EhsrM4DH3ELWJfAX?|jU#v}bo3LIduvfiNV zt04@x&2a0e(FU~79F(#Qnz7RT8U{{3Nbim2X1a%(pON4jg_mthJ|M5=jQf6YYU8Ze zJt>5)S>Rbg%sdsFMz~5$dM!32j_cR_7%?F6bN6yR36%-afcIv#E7wBDkdzJDo+s

EgG=%HqzziCJCtoC0&u!(kgHXE%Th z*fE8bWlbVd-TY2vU4Jke_BH3JPZscum=)q+VF?_6YTyB_0zj)N@`77@CnkxF`TM?p zg+#4{3!2CEBWXvPB{bi3V4RM>_ACdI@+Zvl6n0GXxNYsFyqELFI8VLF%PXP)|1h@} zgpHe_YS$jin7y?{qjf@tS^eelOx-gt4oA47wwHENjxcwDB9Cu^Le*+G;T0!*w}gk0 z@PiTFn*aF}{-5(r@Mj4wBEjn-I5Gs^fau*4eK4Y@K;&nMTo;iyAnstqewNr95bs%H z&N}8LC?9yjj_T3y?=Q|!2b{uh0-m4Hf#i%g4>i@3a)=Q@M@l>T5r3~!HfQRKkul0} zxpuc7n1jbtcFNM9h(LYw;p=jzLE(-e58E>5bRljXNx0i~!f#d!^bOVa`m!0qJ4NR?T@*E;{HXdBD+^;FL$;gb zN}q($gq_jc+uVf=iTNKGYkY*f?Fg5UN~yziRzw(3%{Ry|8?{y~60uW`R@6U7@DCRA zaM~Znh~TyMv?L!E3$D5GHo4DM39WotkN?pt2LCN~aKfo%|JGSaXuwiTs zY@wPEf3zlpPM-TSc315PdaUI0i(K{^xb2vTCdWym{;$-duR9B(8KQ;P+%DHZGJlHw zFNdVjvAglt$aiRPTDshgpR46?VN~zp5nDb~jfckr(24^xUaS1sGp5i;zL2Je#5Y({ zkReMW^=^wgSH!w$vygA$c_mV#ZlF8-PkpYhACwbUH(J;1!b-JbjLwnr2K%>svPwvI zFm=bcS5IHN16kQ%dv3)LC|Ozm>pkf`+tL0o{@m;3NL| zj4BX)-)fvtX8_3xcG8t96p^C#qf0s~ZOEiXVQG^@1tPI->^}H28#6pxyO`=_35y3@ zMID|W2Re?$GHI-i;P)4oe!a|2gx;ylZG(mkbu~z})1I%wuB#6O*&MG#9GWA!#eNw8 z)}~}5@kgf6+NtH_D6=3ySej_-^!cHf;dwE7))$Bx*^|$%H$P$JDj!RizdXd6b@VPh zQpv!!S|=S>z2lHLvtW()UbCb9=PvZE*T92Rr)Ycai1k(=7Nd zr~k_HXm^m}MKO=*kfTBaV1F>9B!1Ur?C9)~8{kxxSAo)PE>t*WQ(8wx0N4GNu`B*C z1L&3?2B)Fdos0!yiVby zY7n^yi&9`)@>y%}UWAQlckLb4_WO}n*T!EgJ?R#H=iCM|y^&Zft7ia8Zm?6_F42So z^{3ex=4#agnOcOpvU;#C{rL3d`V`EGOmIWw$5%we7+CY#^{NJapi^Ff#QU(QdD2{^`1cY(ym4_qSh%LJtIcqOErkA(cs*CLU?hzWFXWJ>6JC z^Ubf%?=il?+TVU>6|$;FT6)i8yms%9O3#v;kB^^%GQ;ZmBjN|(Z{tAzV9i-> z>!}x2k8$J`UD)2r={HDbX17b8cZlv+k_p%&^!5s4cqABK$^R68-4CQo@ti&p zaSMKVoQ0a^-Y4yi-c;c6p|Hu-!|x`0Fg%tI$wv%T(bc&z&!Z56w}$X35S(v!&!+rBCgSn&aexyg9TfIo zXDBHogZJg0(l=%-s6S~E?V*pIQ_l%b9cZ|>tNz(MMK8C_4wCtf`g0d3!|=kc`5S2` zp>hR{s*0R7)V?s7DLZlj{CqF}QDh|m)Sb2da@sBgk^{>p7th5)zHi0B)~9IjkiL5! 
z6)&Oz{bvvF_PsIC=LyGuUh&tp{EOo|(ZG*cv2H*p95PtY|K|sbW_{tcQ6C04{bj9u^wMvbm(0 z&rG~Qf>bOV6}}&Z_93#$`WHxhSw`mE!Prf8=fB5a`k2%pWVZ~$0#%-3YQ@#;m2Q_Y zK9N!6f%~t98=EC%7l-I!+*gi#q?H*e##CCVDRnoT5nz5TP#*#)g490H(#3-Kr-F>; zCvSlOp5!I}32*Rvb`?1L!m7h{MhK~R1^#%+fWC`&TR*;hiPfeonmg3p!uBYi%t@`5g#m5}5e5EY@Ed#J z=?B`Xp#R!ixfZ2#tWK9qyY+=4GROF(;7-?Net2jTW{9y2WoU{_oH_hL6vD!V*fZj zyxM-`=QaIu?4jT1r=ODx5W^Dk{1?KF*cG7qc)5lK-f9_klCKg4<8zf{UJaCByW)jp z_GtmQUGd|DuF?f;mS1`Kdb|rVWa(axAG5+7H&y1&nZ#ktJRw&S6#CWE&hBVu-2AJ4 zRqJ88V|-&n+wZUTU#dT;XBM%G(>VNXxOm)Jn2I$Lb{dKwe7j%?#k@DD%59>d6O44Z zdOZ+w);ZUYGp2)#SA)JepYwreoXk^s#cU9BQw5_roerY%U+zjA7)NpnlljZyFM=!w zpEKNO8YrlECYnm(Vb?d^zi-hV49K~+y*D;)k+`Vwmj67AUQUlVZl1aX-hboYS8g5( zMujr}^C?i(KVUp?+822BOI4Hjvl#DA-#;!QLVM^^9H4$HX* zt}6Yc15)M%)03Ga>e|1AC}k-apu^$Ao(o*eKrrJ?l=VwD>}t7x34S{P)2OiC;jXX8 z*4e+*IBe{J8Jwm*v3$~CjfOFsn$*~fctRonhUZVifV*7a!C4t>EVRwz+PPCm|MU2t z7l$p7*$VF@wuHNg=CtV1?@w>5&l`(hrps6S~Xs{-~j zcm0pClpq|U8t;+^?_gZapYJ$Muz-$De_j(NPGC1DU!db52!G*BA%8U9AzqY@`9UO( z8gtx>v(y)k0GG2@hgSu-AmS!6J-mnXd%UTVS7n-ym2%70l3lusgnL+g31xFf`hxEU z9b#Wo_c;CIchc^vdR_D$xsZ}y>J5>85gdsWAkTn?Z%M!lnoFe0-Aayx-&xxaT%S*X zJ7bjD_g8eFi_X+K?}u0F%T)*m_LT%!Ek#;#B{{*uY5jXCs3D|1r+AxVF$dYC>3f&* z+Ym5jA{X8}+e0hs-Vf`WIC99FVN8U|6a@cfZX1pDMOLKV-}6(_$F_G8E;~85He4C7 zFRFgV4zx`Lne{taVO2gp;NqWw#7|rb@iK@<#CU$G*t+kBOK(KzJL6=*rj^_#D;qx) zVxG_yI~Rm31?;u=yyAj%i5;n%zwD01{m>Vno@IgV66#t?mg^0xu2-mAP!5phTh{r) z;UGwBs2ECqdK>i4ACKE>a0V1mlO<45WRU(2Uz+Q4-T)m;LR8Djg^+2?E#g>DCA>&+ zeusf77cMDR3dTfSgy%zRxtagV&J?2A7n^r zo2Go+ge)%WG+d>x!t_&?WU?<0Aot(-B#WO@0-Nf$4*GaV06F(xG98rCAc}5xN^<%L zWDm^{Bjr_*#HIT^P86QlqkA|F1m}^0pC%Jaip9v~M}M9R-Ck06ng4k$e zacfOobg8$P?HoP08tQd_zWrN6_(q5P;x9+&b*giY&kTiq(l@f?qg`MtE1jlcqAlFV zOLs?ZBnU_<>@_@Cd=1b%%A9O6I}Lb7)wG6R1c0OY8;md2_)#zCrA?2gBwp?;*JI^x z0pOD+driwaAKK^Lsg{_$tAOF|1jhYhGeuD^zaJR2jJ`m+5+@dPVOjq3f! 
zH^59Nd;j%rF^{jSW#7_eK%8a_Z>c07V7MIKBF=pjPQloJxh@QOXG%rN8(>EdwB)eV z8iPts)t~oTjDhz;P0CvNaTx7X{oi}G5bfRbZ_aw1xSWbbmm;KZW)lcyf5W!7;+_j4 z)9MW-qt|ZUpaoVVY9@6TCmZTtQFkg8mm?;ZRBEVIuVL2ucv@LOKVmGv*k|l{5Ev&2 zn;y6=0Qr`)uYQh}hx(l(p$}sO05bQzx?1l%mTGtVW${~A>~B}0*>vGKY_wP4q3H8+ zjMg#OYa;$*!{%NE*_wpm2K0bJ)a8r~_4YSUYLCb+HPFC0+LCZ{w%IN$=31-#2#<(Xdx}D_-W1W8fe~b1K9xM1CWzOf%j?He8^3OZNP3feMT}l+vafJAx_LYp1dWHY2lp3wiwPl)+8@ z;(J~)3P7hR@cSzj1kx~ZdJYA=KwR?VdoJE8!;Ze^-Cxh{gZ(}7;l6(&IT-j_6Gavz z4sVB1hYz~3K$@tnL$fLzz?dWW?a;IfTn|uSVff$=8#zYU6U>C5gKfa@LWL6W5NDhD zd5j0H&4xrKUaCc`48|zdo0l4d)@vMfzbZp%wf(CX%|0Q>Ppa{5eI*#xEY+{zVZD>dQa&i|*+E zyyArKmhdnVelWsYL--U3&NsoICAf$LuZ!Tw5PSoocT4oah@JwGpCxi#MBae7gAw~# zVsAjaXNfsm;8DHle!mszDsyAhZM%ojo3`JMtTjTUwQqh7w0MEMVf%O_(0WI`%$+6T z;pNQ+MvKFGjx>MOc{TbE-99^uB%V#eZ<|#i4sNHtBc&noP^GDSY`YpE7npAH%oByn z)CLSq@E}yPKQQP`z0z=vE|bo6X+Lo0ncRKP}ZQsD{P1eWkBsFZ?(6A!(bbVS{h3eUOTHrQb)fYK&y zB9!*>=vXmNr%lTS!p^n9dnJ1doAHUJsNRVJ@0sk)h9Ar0K??IGX^G^xH>*+p`Iaak z^Jx3T>~RHrY+=}hbzvPFA9_GPAQT0XE$Gt!`EC=i_q*pw9!6#taW}*7L7-e(3OFgNlfDFITM$f;KgAdKb_6>1x0~yQcR8rmmv75V_ zOuB<1WQF+!R4){<{!9EZ^7nVtOP&?fk0dceEBET3?z=ncAu=Y(_A#%qz(@gSot|PO z+K?kmFX91K5=MsHN=?TERo{Gi^5-x}^fbmqpR&N8U1d)jE|LM(&%32x-cMuTazpND z<_F|gHtWtKr6x>fZ|WB)ABk;dlC7lNSXQ@Gq8j-Am=!e7PaZ22J_1IPR~*fv7u6#S z)35{_(Wv=Ys}?lqmX1{hkCl}xnI9``VeOPbVEl%55{xp zWc}nF09TK6+&&Vug1GQ=9ZC^;ggFNH>=_NzgB0tmw*JiPNHkX-T@3rO`r@}s!SBGj zdR%yCgAY>~BHGyUAm&mFcJoO4+s=Rz#4YLR{6QeaRO^U#ePz?^g9;4{XBTTGZ}8o{Nlik z$QbOwY=KZ%qyXqq7dmo+tOIK`37>wY{0TcTGUbpPCfPn?}j_dQZSv%IVlHT(>j|0DQrHlhM!uIlO# zw7ZUZw5A=?EN27pQ+*LZm%1C$^WD@a#LGXe#^As!~?u4%3Z@OhU9tEv?oIQcD41k2dK_$l3;Ll`kH;?pjBWr@I=sjwk*^WJ|s>ZZ}23{eS+Cr}OZq`hEX6*&~GP zY?5SUpWFLbA(gC3C?iS8$|#l0l2P`m>=a5?#JSyvLPkS&NQzKoCE5Ls-{brK{1N9m z_xrkDujkX46Fatx9MWD`FS_sxBKz&YPIe-`H= z;=C@LBZKn|@ZK%n2g7>`czzbob>Vpf{0@fiXYsuO{+`9>Z2GOIR@~PUF@4pEFeTd% zMCBgzTxNwIExp*c;T6pR`E6IaCB2pivpMVuxAphH#6|aaU2IM0@aL+wCRZ}%()!I? 
z+Kay1zC?{;&w?f= zf1%#dSL=gqpAc=rBPn9h1{Q9da)6)G%aM3eXK|ej%H5*w#;d~?C=zcE>e-_P) zeM_Wp5z#Xi_umA2T2N$FF89+`61;5>R;Ta{5QmVjIi+`tEUG1KDSa?hp+Voan@9x| zuHI#IAQ46l7 zh2y)S)ApPN=Zo%03fQ2+W&bqB)Nu65>S2rEl{~Cwm+Z9|xI~hh@Dn&9vPNk7GeKW# zEC9#t_%}Y5wPMw|Lk?t-7udjP*0B-cB#hamtdv*r5f+vF_WPd3*Ch5vGB;iOJgFi0 z&F?100zyIg)8X=NIUr>ay94LcIDXUcKh@KGME#5J@JyCF zk}fy!ta;Lf%^V1#7VI3unm)G|GG&Zn&sv7R_b`lMXQlYk=Zd0HaQO#SwJ}kcs{i^b z&*Ll<{358--mwM^@7A&*eme+dUcQ~O7OX(R^76ip%jM{IJ>x|=c_Sz;S9-corx}gv zJeAhUp$4^oKdi_me<5vl-lgLp-$pAjU()D70H!4>>?2(7gOyH|a_y{>0-EyOVeQc@ zP-atKeGMg{GMS(Grt{vImB4codAcWklVZ|C$apIAezZ; zth^Y2G(_Fm>NGMbE9H@%`V^R;zZBqnSBQGnMTm zCpQh>uCDut*;ri}ze(W}>(IEzU1<17So!)r<@HcKR@nUM^j*JhboXjWnS~ew4D3<3 zamML0R`o&fVfe*2XzGvOy^7!^l8^kaHMKjV81T`3yz9>-so3a#K%-JGpwJB=h(MfgKwlG3@To-M$(}Ki|5S@^li&iu(Dm2ho!19QZZ8Zq63$1UW$&~PcDk7;Z0OE?CwdCE z$N$dF!$$y;lI>lL`$SEod->tmo#!K%|H)HTXC|45%aLcd&S$Sg4w}{1wJJ_|6R^kWq8%58^IjBes`0 zMzrH+6N(g`J(MLS3LSoJ-0}Fq3p7f5<1dMD1Nw)|ZSw4t-#O2IlB;As#Ob+5E`bZA9XR*VAsJEgkyBIi;UK}8&-t5AOVLtFLdF3_5r-0=(`|YQ2zA@$kzQ32t z3Q%#|KxW-Fet6-~Bdu?jQ~mCqSx!wzwPmyHwP6}kvF&cIYdDKMCS!|t9OZ%SlcAyllHwrx7(ahj z+$v$F)J%ZG{0HH2g3RB2jhWb{Pv{@#5KIv&`ib+t$A%dVJ%)9R1xlM9mo2Qw{}F|5QZoCy%xq_ zwWB#3Mr4$}i7B@omTa*zf;5IE7d(rwT6e( z9kBmKG8>&`*84m{sO;WNCoxomh5zWv6K8#mohr)Z-SX)`(1PpF?Rrg6!Te}q@*NN4 z3O`fa{<$98c%EE)uB-_&O5wYE=fFqwVDuRE&)J`Zo{&`C z&xN_z$FMzH)M?4c;PjctKkh$4)79tyzI*co>l5Z&lf3a89c}gP)+ErAi*`Ld>35!q z9JuK>;&9#WeuOf+JGInQ;dr^>duno&01 zuh3&c(A|snG*I~Ou6Or3Xegd5R~k>}?}UmK`lD8bme}}l(*zE~Y-}Y>;_b6L{^;0O z+YB3vbY#6?0HqTDqT=3$-*J__D5AMc^NRyD`3%>zy@ksXYCJi@WSF#@2$|H0qx$^B zf6rz6c+W|Yd6SC%1!iEx>4fukH;Wx%1NF012ZerguYQ!hW5 zMB9d5?Q8*(Pt=JgzW<4Gld^#teREbyH`g$ZIQx<5F)vmrMIrMtJv_8WWykt(19qg+R3V7216!z~zIc0QEgDR)9}4Jv zi^nneRVU99J75|3|G)y}*#!{PNtP&E6zal8!#i_`f z!uB2n%GJ1e0Jw=Q-2^7+81w9(eoQM{n9+6 zZNp0LuAS1#4GV;THKu{u@efk5RK$+{q$B$ta(G=L5diJlv55}p4a6) zSdQjFR%swY^GlGL^4&w2Q!*Pw!btNvbvX}$rnCqOm ziG`x3E#-4*Og-4?152t08J?nLd#RdcED7^#cVj=t$pbFE1s1lYgYU~R;i4>LNPp0@(O34B=8$6;hcKY}U|Z+g=sXDkfxf;B$xA19LModa(~A?w 
z(Y`ygZ`8F1kZkM6LpA-KSWu3i=DZy<^e~k54h-BP6g-U;6B+zVC_h%ub=b8SJt$*0 zmC8&+)8Rv6kr`Fkot@P=MXAN8PPdFcN&h#t!dF%2{hEmwWTbs6bDHAc{%+RA^m`I@ z@>p_?Kgz)#jL6()D%&L3^?Bt>N~9zIf|K`LBZja-Bf@6VQw3-yys;^9vK*bfFzUKt zIf&^s^XC0MNQ4c~NS8MAYEeO_&yRCon1P0DNm$e8E`lkal~amO3>xJnhB4nsL~14n z9OhXHuvGe-_Zk<);DDppUA@jt0@atNs=sThQOL0Y7aF55%z1GwDMGvk8x5FBa7`E| zF+Wl|>Ta+~AYOe0+GxI#stW_HBJ=zZ1-Bw(-5rUFO-BlJ_J?DCwLG_^^i#1LTGVYH zZu-FRFa7VbSdPG}mqd<4+4)0<6Fa0o3txnLtmmQyv~K~bGY{VIolF8Aiq&_+W3GYj z`kSYYnx+D^cara)?J*&KbY45j7ZnIvIz4X6mZwtk4NeCllMfSP&mOS&oEQX#?4ENUHJ_Vff zjq_)5E+WqB!Z|WH-vIC3;(ai@r-0{Y@mv?4H^A>;_3$jgmMYZ0*_Xx1qex%2g|UrdHCh zrkS4|{)^$L@UeU?K{W^ykaTM=w=ctnv%in+_|r$CWxfCK&U*$hb?WrivRx%~M=KwgVVU18zZnHY>UT)_1=)gj%ejs>C*y&J_*;dWe%C?MCyTd^J941M zX@;=wk5AzhPy5m7?*)*%w4{PC>jdh>7Q@Khhh0=%xZpB zWHyJgPl~F{rTA{K8u7wliiZ)wKFyKc;YtA6m>{#4Twue(`k#@fC=96jV@TnAV~4q4 zYf<>Kh|oKIMYZG@2q82w1hq|)IO9Wq%-I{m@h49bD0~C#M2*^Z?-mJPALh*6as&s? zm1ZBZa{}cS7CgnHXW=;Q5z*~jSI8n`>UQ3M?rrJ+-@%yqxI&$$s37vcRBl$ZKrres zY&B~X1v2X+Pr08h6Z&dHcIgbJW3ZN;P596=%%`|v?bh|Lm{EtNiV3C+K7{6}^%h7& z&go;9)I1JM(F#KzuNRlK%>vQh4(`(*49KYHXSC(O z^GQOS!k6Y3r zZiL#!489Iw)rr))iT-^!8KH4T4@Evb)PM917cec-CEr<*0y@F8q0eH*NOGjcPm#*y zNNs9sLWrdj^*>{smEGilBHHActZ)hFeag~cUG5v9BI7NWRY)STDxf=&N`DuTxF7Kk%2ot4y$hmv2F;xyWI z$PeyM#9D-g!0hq#ImXg^@bFjmr7=$(^3(4z+J9jPwAt+*9zga`_ENHPm|kVz?qX(8l|g_}nFkd&P0zE$(5&{b0Da2KOo8oNt^z`yUta zf4nZ7BZKn|@ZK%n2g7>`czzbob>Vpf{0@fiXYsuO{+`9>Y(SBFnb~wBmfrn-*O~Gz zbl~Ty`urRTC@`Kw%}n91WQycE-7(^X70YQF^5>Ocd3jF7)sef%Bcb_N%&!ovmqS!l zc*+GG<2+5F>_wx}eS@|7K69j#s!J!jYF0_kFIFvID{YbFUR~zPfBu9t6lHx-<1Yuq zet;KOhjhUeBD_Xk6aYFBv#RI4-`o34nt-FE{OW)g0fv~ezJBt>7z+H$*lENc z4s2@23d3}-Lh{ zqj%ePi_v^$O_UgCgHcjlB#W?VLc5MiNNy(&Ff6=wkm6H7)0S5pDV%T2Nh95#r(Yf( zoG9GcqN4`Bhm+KNKdJ&p2RkB#BZIEKF5c!FV2QCS3f}_F;fpc#~xIv8Z3CEH=3a|S9iPRb?IT%aLxb<8?2O5=~aOCA%MvTud`Ezby(7r{v^T9ZU z%O=izoKV9+ZkgX`b$DP63TYN@?P6)h{_UcU_46u63TLAiny&mHu)KM<^R5X$Safl8 zCetjDRxX$LFH%opf)-u-_!e4`-LGra9OAS9J|213`9l@{>>As+c$^lng2&^34kEOy 
z5_W^5qk`f${c}%@D-ttxD71QC#7&uxKia;hivz`9CdCWcRN(hI6V>I4Z-gxGmzf{W zI|6RH692S+?jW^Io;ratN6K2B%JRC~LE#w#FQam4FvC`{ar*+r&tW<1!Mk4q>UiFD z^i2Ho=;s(KM#kq>$n(g6|U6x*O@sIkj&l1JA(_374{f;3}&z7OYAM-1*Wr0>Lq^nsUy0A}Fu|G~`JV5)Fr30FH zlz(i)ZjgCAV`OM#Kb)wjE4%ul7+aaWCF9$Q6}NYhZe4@l z`&a7>Ph^1O9Uoh!Ssw!zwiDld-15LEyjSv5K^wm3XwMb>YE9$|4p;gHqu|Pi%5fv{ zGhiv9j{e9+Yhu8Hp3>~`aOfAHQ_M5r3j~5D#Rd~j5;u%?&sfDo!{{c`MDPhW5Eg0B zK5VT=?9jFjxY%I~kFubRL=V$R;7oIo3?_l_T7T+7-?^%4#HZVNu{_>5&c{HosSSqvzs!eBd zUwG(&03q8`k13pcOkHa~hng;ksGZoddR2&JP>q~>kaih;V}5=jpfLpt&^>hLjbsnT zZf+@>)GPqYqy8SNa}omrA4{)LZq7hkXpNeqNd}xmcE({pH~{lbXC7<-q&54;}I z{(Mf(ig-dHY;TrO7>M;v=gL2F4Se$*3aa~HNvs~t*lYJV98m2!{kfc*8a&UpQ1cYF zAXL#HF;oxl>xlVEZnbSa$va6sph}L-C5EyYo*O zdAp^d;{BE@=g;s!E&E~(mtS14obKx#inj)7dhOr#DUdIcAg!B#I%G86$-~8d0=jJK z>~)bQ08WNcK?<)6iRWzkQ8+T>C_~N^XB+B6y`2ea@;k;F{kJKh8IGzDnL8@YoLSc6O8klKZXL zYKv4}AtQ%@D>eZRAJhK(ACP_rIe z-c3c$5=?R%q406o%1@b{JFWzt41e;GjX4Dr>1Yeuy!9Z9rEg^ze-HM(0_+;VI#E1F z&}%UjFGRJg^Z}7d5}4)$4RYHYg04=Hc?N$Jp=jwlw*0R(1XdZTi3=v4fMPh#R^#>n zg*tQT;u^kS7k5>mvuP+eHR<`b^@tJpaNb%rnEe+v&c6^7DJTG5s{T!}U@-)mvHg*^ zUoa7E^E}EfOKO4v34R`R8eZAKr} z43Czrs6%CtZxi6J4&q1>?>a(^fq@*~PyXAhn9vEjt>?<$5yx)Ctaf{tRMJou9?!lT z2=D9oY~HH}tLnJ?-4A`ho(Ba`|0wT3JT&7H9-@~qsT@gqk`=|fw3s^VL{@_lZawes zs2_tAH|M0$bu&0Bc`#3pQHQ)XJ%S`#^VD`|wp1&Mzy+&1@mrb0CS8C5U3ngzUj2**XeY>9R>SvEyu>cl#hK;}Z9 zhoID=Fwy&z9r2Ep|AN3*Wg?sRw8)U*129!%#{K2`Y2t7xspRuB1tLWzBg<-k2^b5q z-+bRf!EzrJ=Q_Mw8gBK~N4wEj1KcZ)`)+X$Bkl*oy*0Q`0q1<<{8^lfi1WH|jttH> zzRUvYG>zLO37e+GI5!Fh;YDF@a zt%$^gf+#syi#zpVfFaRZ!LP#vSQ9&oGQ<9K?I(K6OvbAxuoHi+nNCu9)&OO`Yh}?1 zwq&Y!z0C~u&)64@)ysxEY5)_1kbJza9XaHBv%2P=Zy3V8~JX*J^L-C4Z1hpPlV_57Ug1$ z=NwLf8=SvXn0IrM?Oue0WTG+b+O4lbTYD!F_^aTwtn~o3)Rqy?QGUG&BKc?a#q>cU zAnkm!t_ud1vTk@5Qaz;?`WIUQWg~uPXORMrU{C4URYW_=x)4Jr4*7M$kvRrQj7xVefxK)& zSH7T-mter&mu>UvDeV=THm zUcLouK<4)^tzPa^CqCyGh;kEM$L?E>OFgwzBL@Gw$~#3RNi3P1S?b~-kZJerZRQ(2 zLS|zVtjxC8C)Z#7+%EIVgvcp*4ZGl)4b(gtj%1h6lKZu1pLf+CCR(rjh!NP4P05Fg z>WfM-lfCM4o+ddTB64Rk?gR4=LBTA|pw7Kz()3*F5CW 
zlN&wQmJ;WE5Xu0Yge6|QQIm%i#|f3R`odtaq+zyBi~tV2PJdLD`2qzRyIs-v+lFQQ z`nX_uEC(|UGTZO^@)Sz23iD1CSYRT);+dvQ zoad<3*WBd=p5EJ?rp+Qk6whqzdJ{wO^YXH#K9}AJ8NSj*eKc#qu6lbmljh4&$mUH9nTakU2v3jFqh;!{AshKD04oNuz3yoO!vvL^IC z_V*W6oEdZ#Ihh<5KYUK1U?1_b68T*7J1ye8ijw%)FAXAVoUJ(L06p>H{_e_Ly$MvH8TEeGb7_iS@%p=0 z>z}cRr;{#hXX(g+*i)NZ2Q0y>DPFA#32NeI_;3Fzs}01zFIU8$bPgDTS|5o|8`$&x zU)J9`?1lqDJh|`d!~hkL;uA1`hJ-EHg;JjNBaeY~?~#yB%(u6yM&%8FNBCa;FcsPd zR73fXeSOl6t$jM@0EQ-!b;FZW3FRNL#>Ni^A58NC&5FBzW}jrC9QAE?aYa#}F?@oG zjgExwKku>ka(aP2wtv<>`azHEG<>Hou1Sa7 z_ko@};@CQRn^$I)bf4m(dU2MOc1iZ*?sSDhNc>5aI)lX zc*8_wXn9IwmqtzYyZ#_CBF6&op7+6oav6vmk_WroZg-MyrLMJKn4ks%lIu;MDoe35 ze`L>&Rer@HJ)Oc97zU9|U~nd#f)W_`P@y||V4XCccx7qRd>Ex%Dt=IuF^9EuUBAhl z+>T6QUT+Gn3c^i7MbNsW6rdV;Q{NULN7}e`vORaGo|2F5j&%!cKs@&rxV)JZ6kbF!unO}+D7eEqj&X-4QfQ8EN4aZht zV({qi9nl&4;OPxE~Dn*5Ez`ob!$I zXK^kf&g;TCGC1D=@7>~kFubRL=V$R;7oIo3?_l_T7T+7-?^%4#7T@7LkmF6^-grLd zI((uNrIbbpatbQL_+!+}huTL7zC8sAXJ?oIc$=DgK1K-+&_qUe{LHp7H#6t1$>>OSr* z2lUqA$jcLxzmZvY*6zvDuSk-kg>a!ujC_{Ykfzv6mMpa2^wN*=0(g{K;t@@l2Kh@! 
zX$~xrB|kP&YU1K6fZC074tCEq$k8KRjO%-)$UI3gMi<-ipewZwu{T76tjE;dYTLm_ z{&O%cCT)Krd|AVhkfZeh+%K`LEDF5FN=NKZ{E=zIsBF*c z6wg&*Prh3iItuH6NdJoBfEXPhW}s>$y*!WYB2FBkcxx~b+4kR>FZ(con(jZ96rNRC z_Mb{HuLt&eEKYx}q#{~d3PwI#R-z5PwuJf*$K)E21XkE z3}ELB@6aaYJy_GFfZe&+tyNUr%A?}^lhNN;^A;bo~T475D9@0yJg;1luc z9NQ}ke3(VP+7~fHY0d0eVX0c|NrFwj+=(14Y50wpJyjqUVvwP+ha((QG6Rpy&N^dn zocyQ1`AL%JmM%oUcT^&$mKN-AWs)S*FT9K}=3yr1S?6Ai7?{VD4~{0~1W5o&^X$!( zPCeRJEpq7hF&biBXHbYnIRSc`y%}o>EyoW3opG{!wTyY}F-t3__;cKAhRX6U&S3O< z{;?0Pa6(4!IzIVN)L?({wA^6*8%%AC@lET?I!unbYsbIG-Kc`8=(^mNBA^l-=C^U@ zf~kL2orTK!uoXL*(J9UZPgE)18bQY{LR`@*3otdcJ%y^Dnnt+82bZqKbMEP;+#kjSX0kBu=xdiQ1)-6`42|1*c!U1UdU8!+cvdk$7w5OAa3Z&yFDVl)$}!7#a7Xnyw1lp}~ow z_P$1p?cbggZbhKl4Yp6AoEC8Mw>hb+R117rVLJV;ybp^ukET0D>PJnSN*~MgKA?i* zas!#K72!{(#r{^_10aTyG22780~i^mC@WVaU@B)k8gGoYR@aq$AYGqm6O5ZbwLw7u%dAMng1+s-OQc{GQP#J{+@9 z4ou}FezW|yi>5=D*uQRgAE=f99xA#Zi@H7J>f=MqH4S&27DoTtJ3=~X5_W|R)rRy8MG0a)y=7Ur3 z)qtLlOOLI-B50z&yZ6xdUc^`OiuTv=D-^+S@?;m)XAJj>JI@SXympT%=sc-{cNgW>yGd~bliXYn~pJZEw=``IVV zHzjJsyGj!5zvYsbyeI)bc~M#ZA&9}`Ls{}Zjr%}1ODF93bsG!$+hpu=p#uADyt+GC zJ`78-q3(Zf^Z@m#Oz%{AUVycw+Vf@Xpe6^{c?OBG^dbTJ{8#C|bmWBvU%MmelsRkY z=BQT9O)Ng|)+mrEN!;M5JlB7Mg}86-hP0lo5K%?N^IE2KHdKxsE8RWHK-BD-+PoPn zNHlM&-|Bjl1(U;G@*nD^CH@SZ8)3KSA?}pV;El4#g8TIsocG7D66cGa{u2E$h6$)% zd~#HE5($4pL!5fxu-YIk3U1$Ivu{n^2UGv<=*ia>$QAU48+fwbMBm| ztc5v@-QgL`8&ro3oBQ|3U!6moF)ubKye_0o%kKbvIAGIC&4;IK;$TE=L*UwmA}pYO zz)q(0!8lkX-*5L6#%=ex?`b@X?TUP-r zX&DML5+0xncUQk|zh@h*(?qxDZ1RE7{8wAKm|c|aTF4b8K<6>)@Zpg>Y+V~$F$rxW zlreT3+gG$kKm@&1E`>bo;)L8um4-_6B_Vg@*7|iM+JCj*$kz+aE=0C_-XUT{?mxtT zm?H60VP(AeUv1*&U0*Uk@Tn2iuKIskm0rWH{Ar(GzPOB@RQB!2daDDtr>C=pdB0(j z*;hYZy~#p`O@f|t+)ltFKrsIi?<)4)^Nye5UMli&mxZU#LN9|W?mx)P^a~gxk0t9d zTM;PTc6;iD$u7VUnwPCp@ETz4*sh_5!wFB9~R#_+bk0Pm@*uP0GJgXKebd z3<;TCFO-9>n4kI$&Xu)Vtk(Qq$f{2@^64NxoC*An&f4cYuQ&ZgcGE{k$y4*_tJGQ7 zILSd2|KLVqa>YHgd>DzT%uqZxubNd{6Kb$xdC$11XXU8YF4v0R+MMzkiOuITeZk<< z>X`d35V0MPwmH;z0cM1}-?YyEj#&wQJv<}L38-FfPTxH=L2^@fPpf?ukMb;U?_&N| 
ziiJ|BMN_^Jn3_TsUC*W-6s4E2(Yv=yC|GTljTdi5ETU9SrwiLKruWBPCUOfgcyPwT z?*bEKwAfGlbaRr#MejB=UN=dIs7t0=*~mmq0U>{dl=4x~>(Cctw<$c82jtfG3J)=- z37Q?1Mj^0HpZ}CVmlZUwkGSD|BLX%ZkPxwa)D`~e8I8R zRKWMzQg+_2G2v?4Z7!@yhf}}=W@`wc+G2(;to*dRJD5@(Y z@n-EbQ1~c&XQ7;x$o*meF~##4u=E_bV({=B6w2)UNd0O9Gx)JI@SXympT%=sc-{cNgW>yGd~bliXYo1v`-r}Z zw3i(ge?ubiYxZH(;PF;5IS5cVo-1CwtKl!j=Dw4HdGLx-larXID}YX>7rSW(5mUay!}kwj0Jr+{l|zh|LF|t; z3;Wdpq+bW~m<8^F@tt0td+(kF)k7~$c}{*t*U)LUD>Sh{uG4+qWXJ-%yQDfN*!vCr zOnRkqbJZ8^0r?ic_f^VYv0u6xQZ>UM%U^4;bMuxiRXakhJFXY};F-L-Vm~+ozy-enOvN z%nF{Iw}?L43P9BAv8eDZ-(XF2tb%Mu$s1s=7P=%UcQCBWXZ&6|uQ_0A z+ov8BA_@Q5W+aZi=7H&esk)5f`Ru02HbLDo*xzUlMa?%xXg@cH`ZJOSI{nG9b?sp& zVwM`!F>jNAm*=Cv7AF-fc-5Im6K(>H+^8+&V0nj^?ut;&KaN8#%%g@-Zjk1@FsR;ons#EAa@uczinMcS%&>Y&{IRln>}v|Ie#6rU{*~=T2Tgs$I52lYp`xj_lPW@)wK%y z?N)~5{u=v!jxYx<5_bv}rks%4Pt%t}_YJX<40+Wk`hSEc!S)ZtU1WjduO#Oe4*TG; zJBwLn?j7W?uEM(`)D4yX*7<7`Edi}0PhaOwlmS9~`;Xa&%7WW^q%HX+XLKn3MI?{1 z64L+F@~OP{I0p0=+35s-5F8#zS?fyEL+j9sLk$<+ljh?CIydGRAlFBcGX|spctW0& z&S&KaJm4vNiu?6-xK&6?AM)!We5aZ?dR!wGT;Cv-aD06LN`Z<0$eH_~i7N3mi*g?P zti*lZ<-JshgT}NQ7f)Tyv>mayFGeN^Zx=87Fx))~jL<9YQ{Vx4)(t(w> za-La3BCz_@FWBRJi!!IGdylrSVm<$Y?y$8x0wk&yap7wONJ%nxFjrZ{8u%A)jNU#A z=!Wm_la~wALBi4{R38 zxDkR*?FNNRbM{!sv(Zm4Z!v%#-RW+lGXj7UbnYuM6op}_PhK)N=O8#>mn##Oh=G*q z>Ci+XV6FI@!BtEIFZZ=u4N)sWYpR91Z9iDM*4M&qZ;j$1;mlWUsvxJS>QwpwH*n3$ z?YLDb074HFgD6}?tjb}(HHFuO%r1ix3P%Q`de36+{0ovo$%m*8Y0$t<#@{N1cB>@c zA9YU--eCaMh0ETQ{45rE$%USh>q5rYla5pJ2H2Eg&cBozIk;1TEfxz^f_q0c493_* zfF|@g+4++bUVC`p!1kPF&dLeXG}Fh*Le6Egc7Lg7;nwH8w(NjyLH0P;=UW7aP^~Jg zof9~;Gk+_}7$uCnMT{(V07(igGYX<6=d8d_D%BxvGsl38ipD@{8_W>m+ULk*WLeQ47e>=GynKpT?uE#kWXXth!pYdt6j1tOeNS+lQTMW4?!7B62!Pua~+Tat{C?m;Pu z?&#m7GOI{t;hBU@S~M6aO{HVfW7UzC?+^dy6&L-V?-utk;(jpPTZ8))aLzZ* zpT)U|IIj!m$l!bfymyQD!SJ2}o}a~YU3lIAzk}iXS$uDRzi06|`&+uDJBUpV(=ExB zVvw%-3fKx0-td{{-|CSiihmAGaFpxIsM@|!+jwz!qN4%@T^Msm?HKnL575HY{o%ZtHyYh@eYax1 zCtCX6{Ldpt6lxJ@O60$8kn&bKf|lQlK%a|ORE~ATqYqX?Y9j|SQL#iKi+zL=rRO|m 
z(5E90N4*Abyzk|N=27%Vwmpo$i-I_}{b0B6F4tb#FoI=rw}6Cz3AAn)p&R9)1)FVc zNfiDprXN2sMd2c%VYg*t3a<+rA|`wG{a^JHZ_Hk>%yplzwC0!Xd}t}KF&O;!wg!jx56$=i37J`2`%bn zE?D$!r_AQFDctqd+s6dL8?~IxuAQHPs2EkeIp8bu#^%XT~Gw{ z70K_!DY-XAD>Z{<(k;x_zt`yLF%@`nhT%7dsRIaL4&+kb%*S%BeRx`GdIbLU+tKm~ zSOZDDsC2u>m6*>_e@9ChGx#kolQhhG84<6tk(_TiU;zUDoQhyKkol6hc|mnA_*gX; zA(wRk>Uy5L_klARNwP1;-HQ&!TGczh<)sUP{w5AXnsj-vWs>ZDwxu{0^Y=-mK)4_b-y1W}5pvI~@#PF&A1`VgQoGKfPhX zF#xB%{K9I^LTPeS!w+9> zYdT}p4h4@|9=R{X0YBvQwJa#m1M`1dJVv@)Umh2&f0v1#jYmBx?}rA> zdI=L`;(o_EcWj>U?c1G~?Gv8}SF)vaEp~guG}@cje~|T|ROk>U%WxY7{{1Y!9DP|{vX#$j2MyAa6m85?K({D3l8TV zyaz6Fsmp%NZo`)4H(0=}n|m z0s%%oG5mkuErsX7yX|4joAn;u*=z_^rHVE({VZX7>jr zKZ^!$d*x8Lh*(9QTXSC>Gr;0Lm9SU-Bs2sSj#+ok5>5vlq|Q7=2WsaPxBFlidxgMu zPXT>T%-_z>V&giU%l_9?0FT7Tm-E5OfYH^E&ZnW9#CTvtqlqQuw zUA3}6YHj@+;FEa+6f4?xy1GSyTSb#ghBILxo?G$~bsilsu6wUC`c@xCT>2#S;_OSz znX#rQ#CkVWOZ=HY7Fbj=$tzq zqXLtNnZA`f9-(xH4XNmzN!W?6Cv7*oL}5<*yRX6j_QCGT1<6k~QeeI8_C*?jW)!mS zE&Af=3+&!+^;1;;M8Wv%LmQp^u(ONS@S60W0PuiA42V376h{-~OD!bC4iO=ag2DK9plYB>D@43j;7jV?VHO22&P z|9?EacOX{(|Nn1qlC5OVva;pr93)ASm6hzB%HCvUXJx07?1W@HogWf!ir_rpl zp>oi7D5x8F67ddWVO8Z}MdlkmD~)yuAg4YxpZ1z!$1cByechBN&@eGeo5nzPOef>X zoytaXtb6lL!|z%yn4{!u+^uFQ7&>W3`7DJP?mAW|@!$AQv{nDBWLKa$GCa)8dfr0@ z4j;96_>j=!xHIkZda%qAi91Gy6w=GW;l(K>hRYRDk&7W*CZdo0<4nsxKOzlp5+(0z zU)X_uRR(?|VU!16C7U09P%y(5w2*hkC=+5r;&=`DEeM|I#%i+|6LJZc&C9ino1xm{ zZxb8r%F)J(a!mkez?HcfdDUoTBKz&YPIe-`H=;=C@LBZKn|@ZK%n2g7>`czzbob>Vpf{0@fiXYsuOKF{Ls zY`pVbJeEIWm@_2FX$u^w>R71b}lX`gz)jPvb zNo+ES@_zK@DnV#5u7itzPdM(P>8|3`fp7WYeyiaoZZTPy=4YjlW-1jd+Mz%meQqE1 zsiokdXTFDkl&fX-`)9FtBPowe0V$ToE@jk{5siF03mT6%Ims;opXjB&LIPY&ge}l_S;jx}LRz4N@t{IxOptiNdB#Qrge%9KUwU8F%KG)g z+vQ*HAzin9&+Da)kPCgybLyQm)cL(@5k1mAFo7hO#S_sQQ) zdqc--1b15&Y@w@Kf&8u%UQkd;C@J!04b2dumeaaKg;jd3Ppb7&V`rL(&Rp->?9; zO?)w`VNEawJ>>+>$F6tHpIB}^J;0mea{d$4LQHPMG1rU=*-U?tn)-q!bdP2A$`K>V zJA1KmMKnn8D-AklgMAf)G1vY5(nRQ7*tgqcr>fDght>N(mG7ecvy2tKHdzqI-iPRu5MNBD?AU|9XnS8~h+Zor 
zBEo;2V~lN0<;l~oYb09L&`oO5O%lIaG%4p%qWQ!tP*pd47<|-jyU-z%yZWE%TW;lKO**r?-Io(5~egeimyK7xcGza2y0=+63W2oSz z!1U%-PI$Hzl5e|a4%7x`Zr4Bl57kxEx^bfg#yla zh_y-9LIdrjfnB*|fJflYIg4v-;0;;no2tqNG*>l=JI1?!#f8fq;SgR45H zb~m%a!SrqR+x5sC6vBL^c6Ct{W{u@KTu=-LZAP+2pL%DY=+6Vcl+Osll|t7_7*baM4>L#qNNs>#X^w_I1e2i~aOT!d|TMb|0CF>EG5DrnBt@ zLoc9JDZR`S?u2`k`kQl&=o^$fDI~>FND%oww_;S7ege_8R*ba&`5CG>8M!X(^b(~Q z>f00>9YxE^pNBpB%8%^u+&XOa=S8Yog8mE=e79?9%rB2Tj9X&XMD_$fSmcG`nqM(Q zKydV)^!Y+sBs?>IPy0P3lIQl(hrpkOmN4-S0v8b?5Ynhp0Yoovj;u|(OkBk&xKNk&ASQtStx{E^D`mWh1$++c@XjjkSg*b zo12spP=`?PdmGQUK3>-!V2vaJmgJv=`lN}#1MljVqjz>T>gTWn?IRSCkITM6?}cu~ zIdB@dTReSc**6^ ziGJS=lL<*)Xq}*%nv8KLd~V6A_|Dz?fgbN{GP>_X59Ypyo&LK26?&&jb%u%X^B=4P zWHbI_0MXxkpA{tbLQ9Lv9&IaaDxmc1hw9X2757+{4|iTCpmVFF!Abg6D5o;?Vy>V9 z(!OO{_o(fA>#~zcv%JW%%FevQs9I_ngp}%sIBhqfq5pAyqfJeNT04W^oILYe<&T#G zAFJ_RYsYYS_I}wP73sL!^lbw7(RV(5B4G#j(41rDSGiu~LdJ4|H!*Tgl%81m`1mC{ zjP`@jmvo;~*w%O7_+^cg*kbKxMXRuWl+>z{S(dvO6~El66qdjP0wP~ISZKCGr8RJy z!163wkHp9PomL?5T4*PSM}9(C>+gSGi1~>QX$vgq!xurA;hKr`xu1}Y&5YX!rlqEbXx{uG4PW{Gp0?H9$`48@u0-B z3^mU4uNe2}fC{y0dae@kB^UDM&sW>8tFSQZk{*WqZavWO?DCyMRr+qX|KT>zMn?|M zQ<>SOq4b<@{|l0R20c?1{9d~miE1p9@pVIFaHkGg;;2e21Q*Jvg}}Q|$wM-Juk9IV zaJY#+=pn&-=Gs?&Icgl0rg^F&P)f+t4~W_{Gi^e7@SO6dTKvR-k9_P3Ew~0AN8AVsz*5C)9Iss4o2LcW7j0 zN4zx93FxS8?=lsCge+}uy)%EasWNR^kWn#-%h9# zdWFVRXzcqRA3*avdS3FMev3M(L};5StU^<|^rZTqNZ^nke(Jhq8_+S&Wz}wlEvWg& zAph&r*8wCRNF9rOLFKBrG(NCwLMg#o%K0-Ez|MuFf?{7@QFXmy%3>KBa|g_f{z*K^4*}n4W2?pd^}YKxs^RH5FyBh7WgxNsw0Re!0LXKBVM${q@Vnl~60v8D^=PiBi$1 zL+)k8Xq?ZRt375U$UUfryQxMLQBZzFY)0^kA6Hm1;UM^K=gH;!mI)rl$h~cuW!4`m zU9|@fR;T{9CMw#F5WSvIaq%*Lq?#oOwO4tm5%{xEE{a^SQgcTwQ|X`fdH6xp@~cXT zGk;ay=`GQU+4iZt3XOWNb&(yor5QUu$jm?w`_HME)qtm8#FtyCKjC z+3c!kE$>tWA*R!w)CnqjrYwB2l)qJ$Sel-2Jf3OIRV`J0OPFV!YHpu9dS@$7KUlOL zS4Yo{oK=4ma{~1XQ+=Y=B9Ho8`@>yr-yoUNCcy~PY3Sd^`BvBEU(j#0{K09VOlY`* z>45SV;ZAm+cR99(8JMX{sj)I-Kw~Ac4g{47s{8EwR_^JuU~9Tq?Ne(V)D0$VF6rl^ zYq8pSj4#yzQP(itc}jw}@>=2_-6dj#OJpTiWOha64a-%&jMHjRo0R}tOiKbfJ>mpj 
zJh%(79P=x@*2jwsRjD&>QBs1>qH!2>G8LMo&LqMbbJ4Fo{*29bcIe;hr_Xx(udARj zNRPzD%~n-jue;LYA60a7eK@q`9MFbR%N2Hcf@hs`y0ulp2a?n~JMuz?5gL2plc%;i zfL@mv^sjj{gU(G?mNfpJMH9M4J=xUj(T_^m*`^zbXvJPWAnV*vd5C)DdWb|q!doQe zN)&bIMo(%{Z8$x6z3BY-acmj1qG#AYs$7lU-#_@{eUSt37M*0=&@F<{#g+Gc&%diU zSwvPsf*q~XNr5Kz{&vvQDTiIXTM-aIO^455foKiYWNN{O9TleZ1@^xTE3Gf9+sqw2 zJy67|VrWDn9n${Caq>G;B8td_E&q`yYGv6}Fw3g1YW<+~RnLbORe5<=Cf+;H8Ks+0 zv?tmzK!aCf1J31mLnov+`oc;yP-fl4H*F=P@J{Y84La%`h;FXW*oKxAzHwSPJzQ`F z+D<=fCh1OtUDud}uUoTW+H=JJ0t891@k||_P?;rEqfAJpgxdv#EL0j3pE`u{h7-ci ztt_Ej(Zg^)cV z%EWtK5qW(nX@NI%3=-D!&n&+CU+clOZrGSGCk&R~NZL~>ghKrj%uP87oM-PlRiBMA zQTRU3fVv?gc=Gf9Oywte;KNkEzI+)(j0~1L#L!u;Lz^(W=cx4G2od4&XHfVtcHYzHGEtlQ`!H?Cb9Bem!6GI; z70~I}OXvEpq5%}k5mCjj(6E!i#@O7#hq|6L=hPTmM8k}V-gzAMpt2{Qss%~C zhEUcZm)<^d(7$2#t?x??IzjFgpAq;8l8rWH{Gi4G0%9IX?LN#yM}As{ZI!n{IUTt& z8u`2+P%!Xb1X~f5Fmkr*LiJW_&|bcQ!@!&hSMH}vBZH~XtERWAeE!Lhqlk9V1+)MS zk}?)hwLWY`GD*mpnW=z_qt@)nqI{@qMm@S;&JtpC7JO@a@hO_2(fxz7vOp#2^S%~u z_DZW8^@AWK`$?6n^6v&5ja{LqW;L9q>LF-RRPQ*?19$Y&W7P@a^AS*Z>BFGK?6>Hl zykdjQULUI0*dHG;@2x|&zS^ZIjhn^gWzx(rnzh$$P+bmdt*`>KP{POhu{Y^C}X^Nta zV#EsyT)D;JXs3>{^5)5VtrPC1f})DngU2v2U*BKm{N1SA46TS1MuWxOBeQkq*+w5N zWqdr-Lj~{s&dJUsaR0sj-Xhm~wG5FS@|^r+Oby??E>O~YED7k_i|wXZ&7#))R3!Rw z)bMeOxN6d}8g-VmH>r~i~!hGDxF$OUS;FFqP7)df7_ael~pu`kj7% zD=;Dq?A=v;ocpT?x~kr?{>hLP#E>O<{$-q1k^4qAJ9;K(vt5g! 
zTDF8AY~3;dIM#cgDd$1{40Q@6v^SzmlzqF8HeaEI{|-MCb#+0%`xJ1_H_o5MxrjKg3+Kq-d;`39 zYyN+IFubRL=V$R;7oIo3?_l_T7T+7-^DO?(&X*ZT^fh}zL}f_Zvkgu3;@z<*OYVCp zQ`EA}szDJN1pV?d9C?7&YwUggwfP20GyCGeYx@;i{U++s#Xf)r+!Q)(NmGvMzn;-N z@QOq9r_+rh^6R0#cf@bhh)U52r&g(iRf5l~D#yt2X9*;F_E~zUOeOl|`ePB1i?N6v z#qrM<9=wIdp0qT*)P9flqHoQK=t7VgI4qR^>079o>+87wD+Unq>XK1<$wBMElFQ~v z1y|_BNZ~=`N(6diOy%k*l^rx!)?L>iaTZ*5-P$M;C)^eE>a~BnK7qIwicZBRRHF5m zS-|ij+Q!d}c5Lhlwz6W?|-v~oZrXEJrI<6$%` zOd-Q9i5MKqFzQ}t*lX36e41(@^Qo0ddgD!v^mr?=^+b#tVLxm8=xNME;dPuP?a4$9<2GKet6O-flGA znr88^+!#cMT!_fmZi4c;%W0};X%Wgf=@34P4HY%tsRc9pR5WMx%2hwwr|7S}2y-~T z1mZ28t1lsN4tL|bMRUg)f%cP`ziX}c(2k6R6On08QNTUze2MreRQ%n4&G;2H$huIc zL^itB8Yj12FbpPI8~KZyrK0$1ug4SF4e%kxq~CQiLQJ85zPXL5 z)H%@Gy1|p(D1;2K?hamIO@`QL1-xl)gg}hx7Z-h14H2cHSwzLZ5TfF_A?tgK;QO2l zsog1ERQZJwpQUXvM+2-oCu9{hA(~&Sp074Mpp`K0Z{ZQr2+5nVLCRMYAZ)SC{s~JA z`kFz~eeg>ZIjk`h79H&WHpKBV}cZKs_^A`m% zMddNubzlOYb3f41dT9sujgMcEG;oE(N$7USJLzC9137M(Arn0Rd)?`w$SsWMBkCD= zg9E;(&Fy*+@etU^jFMgT@xVe_e7`?SxP+OF>2$hLrXfoMh+JpS;t?^fjz0Bg`bQfn&|L&dajErLIDUVr;SCFm}Je zjy4ey|F{~}!lQ=qa#wu*$0mj?nw;xUcNf5LuQ=|z#XXF;9}M@_;64SM^NsUoaV{dx z>%uuQINt#8-Qs;Pyr+QYXYpJYo;SenVEBF(-y7ibEdI_)hwOwWvwVZwo`2hCU9*Gh z`^!^jt9#**A;l8xSu-qRKjuH#`xw)`>iIJ7*GtTAdzZ4gvH%O}TBQAWCKxkHk8W!C zTLfAv+SLYf{IPRHvl5>hT(Hv9e+>c#D=)6chjZ(*sw&kiE;9@yI5&6$d}hOJ+fMx(}4fRE%tR6m_HHm=qt(WRn;ROEeTJrnAQbdJQ*$kOq_ z4p0c^J~=V0om5XY;d=spQPlG=_=6*GjzChhtTn);WkD`Ce}3$N8Ie_8pC&w5f6Rg| zUk4M-YZmLylS4jMlR9tTi$YEoF+YA3tO{e5B23qUjNq#2f`w%-eQer%X0YzlUg zeX6QV?2v+278`e!Jm?H774r&EPvo}2cZX#(M}|9ynUbyx63M2R)tvu zvR`jT7FPw&|RH^IwVMC!iCfJeLJ(OufmUb6J)Ued%DcJgSC$7On<`sn-R ze>GWxM|00Y=X3*0zc!q4cu>*bZwYsYuOas}ZDE?l_C!Y=Sxischo!^)7?w0vX-K{5 zhNx{y_}pOXM3u^&I)ZL1V5L2Q|Gm;nLOg^DemvNiLcxHnn6t%sjBQVYv1;Nb60X%C zTUvAmt`+Y6rWSu6V3O@!j(ZVEU8v0uLDdD6x4d>CCA$YIqE1Pf`!x@-oDf%KeWM51 zt~S;^X}FDy-eywwsb#^mbth3n=^a#Upf^`0Xt7FACVN$eO*0 za6=Zk9iWdV#tC^7dgW`DpCPfaTS4y~CeS{q(e>vkS*XI)n<29gURa=B#FKTz731N3 zDL;7A5iSotWtHHjf#vSZkJ!^_VOi?I1NYrb5W9OZqB_K4aN7khe`XtTxHIF)`^R2> 
zAm>eUSgW%JHXqWoKu@d=6THyFFTZ$$2{z?#I$Ro9tD8Znf~zN(__Nw{CW;&Y%E432 zhjj3;5i)c;m_OeybF=k0 zRyglAKlYgpBO^4pkKWm4YvRN5vvX+RJA_%QpAE~-kJ)W`M~*FWzBz7jOa`vGaBqvl zRsqhlru)rfr3&*Nly-Ml?m|IjVhqa-MTpVe9LJl^YFL{ITSv#`IjBhbyJo`DGsHLY zl-Hq_1{S`)dMfOY6LWIQ=BrRHMv4k1IA>8YtZd!;VzD@ZGokz}%%mv+*`P?7e|Ccx zR^)z!Xp7qcNx#SwwU=Cx@s+ah{Jmu~Z#ikR%6$TTj8yY_*Y`pv_U#-EhcpTEswJ>dt9+1lrXgXDElSf{eYvWb+YE#ID8Ocr26`%Z&Yb zW8>}tD#IY^@mOF3C2}A4;3`u5pI03B-Qpfb+z*C(YjB?e&iTgqvp5$K=XK#68Jur` z_iph%7~WIB^Rsxa3(p(icQAZEi|-Bac@}?XbM>>JE?q@4yB3W(`S-8S68m7a<;MjE+OA+ChcDPAa;gy0C)v5-RXP~O7csXJ z8y7K?r_O?B#HtYEM@=yLR10I#b`T~u)4`;Lyy`Z?o+J9rBoBohE@6vI^!0y4H86HQ zTf5sL*~nqRSPF@S9Hyf~bVXI*49HD-$|=19kmaqg5xtXZ5cfl&2D|2IDEfBN*1Nfv zP>EaQnG>7lpq!_A2bnMfjI9x7fp>qPzFF_?+h1XU4^w<>Emjz@kY*$J?OS7j1WoE<-2dqxZ`xzkrg)Sk>h#4S#W%l<9*$o6z zaAYKSQK1Dt&PN1aAG+d_tV8ffp=NWVe+YgPNHm8}o+6D4X%t_@!avo|Gj2aW;%E zASXSh^(4l^R(nr)O%Ks-Ad43GxDBP*O-iqaabv3IYQ+cXZ4e@wxbBp`D2S}L^Ml31 z1=NJ*`*6gH7b2GxUey>)jmfsO{~lD81NJf*F&wo5z*T@FGD>>}a!PHMiqiiDNt#A= z9G>4nPcKO3R%#j%a^EJ#XR8Ik7wec<1CJGGw5DgNjARz&$k6do_kM|UKu6HlRtt5y_1uJ229)Nzz?1+?UH=-ZdtDmzR8%EiZOR%GP zc1X;g=;)m_^ci+qlh?tPcoPPXmCIs|N{PR-f^1+A%ODuA?+mXniI#1v-GPaMQ@(aQ zr-R*JyGXW0Gy?Lb`=7LmJTW3Ksr03oJ;*zBmT~r=30&}%H`aLTjZN4aig|ln#VS=- zoOtw#fD74w#r;3cumWvi+S@a-@Y^c}7l!R00;L)E_BQz|@bU@cF3zY7uo-`h=+^UW zq;~JpPI86-HmZI_xM}qmR&2ucOQ>oCl6afuH9p_~RHDX44R2)v9Z&O~Yg2ldYkdg% zC-*EitSO)-@q`&Z|4_hh&NUUZsk(+v4<;klv183XnL3zGS+~m_A!#h;>8uA5BZ7sN z^v&OUa2*kI-KMHBiT?t!gD#l7aUI;XP zSC0wby?CZGtq6O6ol0#hHykV0sm}k2)dO*n+(EC^gpA55T2YtFg-6;MFT zWkrmAu=315j~@yLVCTSlNlk06BD(aXqld z_D95Yf3X@Ot~(FvK9%|*PW~HQzDa`cfoA^u2bURPn3kGEQ-vAMP8e#|TI#A_- zNcN35$7`^~QmSY3ye@XmdVEyrR|uj&>qny96AOmcb5t{@;}G5p=A{JxG^Ex)^o!s{ zMX6f#8wkEW?EA2a0>L8%a{!r{tqglWp+yxJZSRWM>)H<S_pFC(K2)h?c+`fpaw1RbM}`14I?iD!&o(pPpYM9XYN!0{5u3{AWeMNaFAV z`=gxY*?LPz(KZ9`9nqdnd=w=SXe6IbC{TE@IACp-APgen8fC=+(>Y!RKq zZ*ZSDZ`y(WPLLz>DDlH?vStzE8nr?9OmO^Mk6&QY~ys#wk-X;xb0J`cvzt zzcCURr$QxeBo33?{vk8|NOY}z18A|t5FDu-gADbuBRZuW> 
z0$et#vz-%eNPVFltXsqjpVr4F{WxT?Ve5dIa34`b;xi&x3j4qBA_z11=57 z+qCsp>WntngweG-c@MeZF~@=ie!hA{zN~PdncWUE-Y3`UDOSRu2B*Bxm}bO-_na<0 z!v#!^8k>f91YlBoa(lMYr^sEUo9>^7Ah_Y<$KHmtNFXTn_2V<=SVZV6H~oGh6UMlx zApW~?5A|rQz1((V25Pbl|3Z;+4f(_RJ!#!N2$XfaTh6(miZPT4wN+$uV{8;=tGm9W zu*1ZKFIveyfa$RwRaR#llI1?FO>nXXqw26aO`c=cg6s79oE2*Nu7vy24I$x55GQ;J ztJEHQJ8Gi;+PoIgKa)=}X5xzF-#bAeGiHs2q*ZW{+{;Ju{YTr~4-GL7h9DI~O)hYY zzb)O&)D|gZ_cqfaW`+OWWMXSQM+`g5zoYm3w*$?KO7$Dm-2@IBPpqpo-3S^5?pw9y zq!{UyG|ALl1^9MVSX}|(_cbqh@R~Nt8aPG%9ny8SLS9b^{#~Dxgf&-PsUul6;n;eO z(QE9r3Xx3sXB&&Wv-gI0cc_0Yst>pM})ua#VC3RON_@~jRadNv5 zmfp_m=X{0?+ei3}v2GWnfK$|UYoY;~C%Jgsgi#7Z9vzSDK9_*BdAqd`3ARJxhbfp` zg96r2Z^GX{8jWmH?qaA7A1v+Ex;HQ$37SejxdvGIA^vg$I^ggR+QzB>=p4SozNKQs=iXMZxJ+xaaxJ?mz z&uBl(nE(*v7IyRPyfyM`vP@0Eeh4k{KEe8xmEc*+kGuXO7eUX^UG|jHszL*Kca1M6 zJ7YGQ5^q8coiU@Nn7-kZtMI~$FdbE6MeI~IC3^S13a0;uQQ)?m4ZaofnE_mZo zF4EIrC8}>D{<^a9)>2m7-+s}HXi5%?>(*BK-WE19Dfoz4#NAD~v zqs9(0yxB8w8X*{(bQBZ2+OV%(M01 zaY;;RmC3GVstE`e6dp@a4}p2@>QkxZEnrWk^V1G_oj@sjDxh`N7*;a)R$9@274t-I zW#0Wz4#ElSo+1%P?DJ-LNKxK1qz#kuXuVqsT&V1nuwT+}SA=A8VjDN?K&S56&`X1D zX-`;;N016QC)5KoM7cJpdDd(lBKnM`A_i(MZa;MjoIyigbL&f`_$?!fWHS$30 zErD}GB3coZ0_VHjT7CiI;Z(gkM_T*0V1cwKWbR=i98u)2%@mvnpR+b{o&AD=G1|?u z^5gg6tpoczo06TXNAalwNG9yHv$dwoY+MG^i2k9<19&v>o+^nr@FX9Oc>1x&O`&Aml7*ZuUtX z<$ucL^&TDFrRT@#X?A0*uu-m6MhZ_?IAR@!H8?B``6m4+-`*xE*|pUp!E{(MMk zxo>E(AD^@G=pUQF4a!Kzzr$>h z9GcS+_LOzxe6MoJF> zn00K}LnEFRcDqZU6SRjRMpb-z+kd@4-56bBV9`b7`@8YStyZs5CLNm4N?SGzjVH0& zCgd(h&66+MuAhK)h|6a#5bj_))pEW^``Mwl!!`bdy@AOlIDa(HjxhS9akMx~#&cb%=Cm1;sPhb&G7G-02qmZ0W z4Zh`jwTO=E*CB}c7CdFcg{;pQz$_hRw@=E7VeXu}Q|Wv!kllkhe!BZFfR*s`hjWkz zOvb~M(PeK4*EYX=bi6_nUic{cS!C4{WW64Ce>_wOB;-hSXv-r2!~GqI(p(mf($19N zyh#J&UUA%ai+dPxKN#+ui=NsqG{*R0Jf4nZ7BZKn|@ZK%n2g7>`czzbob>Vpf z{0@fiXYsuOKF{Ls>^ifV+T6kyxbK{Mz~i8HINAN9URppQJizea0-gGML`j1uE_x*c zrmx&Rc;xZ~4vb-nm1um6gb3su>Zzx}l)h~PK5Mn>2vr^qH-(u(4-)UW z`2wzB^+2tzQXmL|t1q9ujCIy>R$RI3fyHf{8PNM+3LDs(G+7gTdlTwKgS2)f$jFYY 
z5jA%)xGnsG;r19K#!|(+ru>o<9_;cLS&tEhwToW@f`1yzr)WI=hA0Y&mw(U|+!qHP za4X05JSjo+A8)1-{3h7%WzRYScOSO%JI+VoLBq#UxzUyR^H_J~h~s}mL5R*{=46M) zP~dZJ_q5o}5M)7Ym{)!z7uXnXh931ZBU54Hqu!9E&^s3$LY{qbo}29`XZb}g%Hb&g zb0{NQ8?}53GpitzViu3YhDobpIqjpdyYGdHD32M#V#Ny83nFsxcGT0~&m3a_qYItt zeBl;Ux30%)P-Y81SIn8wy!ZmNiC!b^B^H8ztSwdX8aTnTVSc}VJ#Ykm&90w?Wwo)( z%aa?9#x)4fi^$uo5@lY#Gy|DBe|=>4fCo!wJm%q?dJBpAFinv` zxJUaf>twUrsKKcV7tOX$lEcwZ@)f-XO{_9x<7NA71Q6o(GwQihg7mpx`I(@c0?dXB z>8Ua(;8W6Wf`6GgF*SXEu}*KA{zX_+CYaI%fNZ;>AjAzh#L;@x;+XYK`M> zS#|~CWRSEWLC6~fRP1{k-N9ym{#H5K&n8&hBtF_3)XbNd9nG^}yw0zEJ9=ja^iNS; zHiZY~S$^zYjK%(5Jxk=vO!C@AL%V$GdH}|~dB{fcUkuil1e`X9iC^14(w1 zcIhs85kk1br0pa-6TN;X?DpRU^%%_hW_pPFue;doC+_zBEvrcXwz!agR1ntNd!8)# zS`nf5J8!NrRtiXN@0=@C6oYfF6-=IaMR6eKWxqX5z#p57qaq?0Nu0f>GrF(V4|G6$K?GsB+wa1J?q8}?;jiA zRfwU0aj!V;yTv_>xE~Dn*5Ez`ob!$IXK^kf&g;TCGC1FW=Kp%PcpnV!Dd728JlBQi z4e&b{zMsYS2KYRSzq8}DyH&iLjj-}Jw9`1X66OW=r>;aCg3R>GuYIy};a@HNlfKgx zaA`r^?SrlZa3j9SI9(|R_K^2hv7|4BE89vbCt~-(4Wrq`k*++#_ukc*=UZ8De_=cS zXOb0gd%CyJkT?z|e|xGsfif1EJ^1dsXq^u1nvG@)t_fkIRQZzm!W`I(HR_@B5ilmD ze36Ezsu+1#m-!(!Jq-k$`#d4^<~S@bV%ZUyu8SeBdivr_FJs5ANc2>GC`9JzS2@bl zDuE`w#c|&=CRl-is74>9GiL7hrHkD|79$%CB2Ars0Jfb~)%h)+Aa|{)PCYuA0x0+% z8xM^L!1GeqInXRZ?rLGu$*ZabZG+!+vfB&5g3C%tar8Gpgq)s?S(77}?^xfwS9qV` zCwyDS!G98~m$0cL_)XwmR+lUScOPzIkCOMxV!|dD&9O^M2FMb7$it7jw*ezpP#Tr6 zKjJ3x^@jSF43Kar=}<`M!7nKXY~CUCGua!doR4}#wU%4CM|t*v=hvgI5OS81RhR!P zb$Mf6@4wkH62)T2)n#oA{)@-182WIzUUGy_E}CY2J1+;DSEVS~l{`e&@Aa&8u8Cun zkN84!EG=Oq)#mZs&F9G0WS*$#M=k6caqrzOb1T@8%ef0v?nVNezEVS6rkGREN(Ney zg~Xfwu=%Bz20Ybc%`%xUr}T-{O(NVWwOK6E)CBUxM& z-+zk2YZ6@cuC<~tvi1BkF|#bDAyxWY%gX}9CD|4>w520A#LVofX+r>GgSuOs$_Avb zXwsg2><`MD%-S~w>d=82?(4;;iQ(h`rDr=NuGqV^l+E1OWQ;|2?x5*(KDO2wE7VtD z4X-BN7^jK50IO$n48fBRK|$EyC5~Qc*zR9Hx70%ejM4dLO|E=B_}b(eG?r-#H+8jN zalU4O#W+b1I^O64HdDKw?({grO;3vB*?geEl(`M2qUjGUZNi*rX9uJ>!B|ezE@*o z#BdvHaVVfRJ1hXx8o^ymT;bR*Hc&0g6oJLrlx5vgsz4?vt9`Dox?o(tW+JouZvx62 
zjMzqW0U(K%%BLTqgdd+NWZoW>hMkC)nRKa&!s71WFHymo((5~TJ`?1SRY5E=C$g;5{EpTN5$AO^wesD#qlW&rN-bLyk7JORhaZJv-vyD0SFckHApFC1z>oEm~|LW5t) zf~cqI3G>qb?-j>=x44H9_k-cy8r-LVbG~u@EY3y5d0jY12Im{#y<5BwhW8Zk{4Ad9 z!t)0B9Sq;k;(G&pp2gqUFhxVCE2;^WKyFTb3wwfu1#_qr9)`nm+^E6$_eR)z@yzsB zcf$K7WAe#(A{-{_J62CE@(LE$wtTs^U5-#?jqKY$PlTJ8stsNTKZZqVvb&A)Qjjl7 zVGD%?_u$m1guSVxn}8x#)NAeBJ%Ewbuo(sbxKf}h_5N>BIId-hiZhV#e0!kYzA1KmQv`hh`eUY1CT@HZd&RXE3 zJuTSRdy~BGi4C?_U^j8nT;&iGyLEP%|PIkPSw5@DR|ho zW_q4%1=`<;sLR}vCU}Tm?fK_IaMrQ<6n@S)r0bQWqgziRC<=6JnT-wwtEo1nN4~zD zCA9y@BenDUUPPhf1Z(p7|hCYs!uwL$0&L7#wzF(J>6-KmhJs=n`l2C#;w6fH2WT@Gx(6VsE)BsLXV`4=hZIt@7j} z`ot8l*DL8g!Ur3SjECpm$;`*7Ewf9vmtJ5DF|ATDI+a*R?zG+KL``h!z_6e0l_gfz zh#A={s)1YYmHd--4d5imM0}b%2qUCz&Z7jLUG-3h;M9l*>^*DDq+RZXtuS0O42NR? zcIT-_$ThLE5?ohRf}ip$Q39SXsmuNG3m{k7q+8(yp4UoT)xnMH%V zKVtz=@>5C`x;Yh!PJ}BUQ3U()ZSmKAN@MuZ@gnAQQ!OmIIqb=gOgCgic5+gr&k0O* zTm8N@X%9~2miC&PD`##sb^rpwiFvqJiS1Yac1gC4=cv0++zD9L=M(claVrx zxr6Y35f2MaxQB%O8=?Ql9}NypJe|p0l*76Lv%XWDvWMlmGGVDR+JI@SXympT%=sc-{cN zgW>yGd~blyv-ms9NV(Pb8N?C&B{OZuN*oYH}Ra*E$q>QTd{qz3+U3#GXiohvFQ^>Vh{y(11!=20T@&9Ha zBw2|<$_QDR$9>EO$}FR@XN2sP?Cfj`B_Xq@-iXud*p(tQsF0OZ$f{Jz@73q}-QT~! zb>-?f_wziTj|Y{Xhezi|@(SmJ5jE#EQ)!C@efD*BTz;Y8jd165?}k76r2~fC4V)k|$yCkU&6PzQ?b9M^C z!F!V?(|_)Q$%yXJYdmG>`bsOk`;8KGHcu&i(@7rOr+k5R^5h3f_4Y`6Cp}u2{OZlw zh22*GlfOAbrDy@_Z9n_LvDyMPzT`c>ZWRgW#6F9!QQMvD8g(3}VxFRv1@_<8JhTrR z4-8i&-tDCN2&7I9evJeykKP%YL|Va$O_hmkE-7fcJ99a$Tm$;jq%+0O83^G}!EB9L zGce_=%4+;GhnBpeD0}oSL7_e;(~#^oaP+HsbSaY_JYVv4I5STU>krdR9X*hXoS=pG z3FCY)bx5`3LRu!sxQSjmJa>U^hmO`f?AFJj{HhwulzLzkb~0dfUm-{g6**m;)q)gg zHkUlPp#eEseWR0esNYlHqZ`A2(qM~K7H?VsU-X}AiN&tXnMgDt&gn~d39{Re%^mQ% zg({j2U8y?-*f1^SK<&>$`g?i()Ls{SyEUzb+BblpWq6wU4u)5cs2Nl9S-hUVFX!v2 zVCet)Kbe2`*(!~sa?k5G5uN-AW}2Z&+^KV64pd_?!!l!{}v09?#RSCbY8}F^;^;{wbYPw zSz87nc?o%mi`1MsPGD1p^{-zYWN|Tr=6}1!I3Uxu4QI)?95ng-^Xr+S6i{()B^su? 
zPQ?N&ykWePadT`liEHD_a5ZtiE;%#Q^m z()I7YT%oZ1r+nJWG#zEfBr^A96qF18Rxop| z0kNj0j_F*ffpp86g)`QX@M+kLxl?~LauW?>W{$0b;cc{Bt_(47)31%)N1o@SK(*{Y z%4z8^U)g^IR%fGoXRL_6>y|UMDdKU4j|yG}+oA8LJiF@sSHYde zY(1Nk0m%5BdeC^UCypPENmm>)!{(=w^Io~gz|sne(e}AZpvdFuTd%AB;Plzv!<-ib zapXIWOCfn`OU{W?sG$}cVUY%sR@!R+y3}_V% zb7%r48#k`Q>@k98`bhFq!zs#4%d17|4xZwp(=d@?GfQkzP;MAI?g_JVR|Sp< zg+Q5;Od?T2K2SvQNa)8%KRBSfkivUN1$%7OYTAA~9YubbyuPVh4f`23Jh*(!0>4?Z z-*j911}Gjhza~3sg|EKK)}B6Wiaj#q^xWfp!S?N&eJ{Uv#vd}+Jaco;BRh*|srR)S zVAg6dyi7v^XC~G8|Ed*%MjwQa#cdLX_CuV#yN+k0_$Io*AD1g0-E^D`mmpf$2mngbM8OI*m?1Ms1 zQJj42-NfETCQ5D}HHR^2W|3hFg;EP$U#1IVVbw(ziH|ntq3HPK>&8<4aFB{94(>e) zS>~cIeo6HOqDfBDS+A+@VJmeTNW>n-f58@DM=8I|2h<3!8S-dxNtPNnb2K_@HMV z$_KWy`BA+%5-e$p--(vSEiHKq5?H8B=NzL?E5T=tnKKl2z}BgWPXTAB-wopetZfFL zC^Yr?spIj~9v<_vlH39T+uIOm6+ z$(1M^A@(2l%X^X7^R>BAy$gr4JuBnH}eOL%w0*Rwp<5uKkJUHA6O^& z#I}Ch*>(UX_~FB)yZLd`Q;yUP`($*yIo)PwdM08L+Olh(l@KiWeOJ_R$^vFuxoTcj zQ^JM{Y9HC#Gl1s7=QOWq_kl>|19Bek_d=SRA9IggwtzF|zK`dt?S&jeF7A)%? zv(5rA3$%XTS*ARB9{4NH9m`6g!=8LW-U1MrunHNkd#Dt1!0ZBW*L{MGV{kw|E6%IsNy z6_n|FnHcZ_z$l}X)5+J)VK<%R+jg@_z~Scsm7D)LpzIeew0e(p;Iw#rweC+_{B1;J zwN1?l%kXKf9yoax_?^#ado-Ja_U3Q}DBEYF6vyWu{;>7!n8%|Bz-E_X6V-d zQ2BYdQvD*my7PC6e)I3+>dT8nZm_Zvbq9~p2acL!nGTp%-|TG|cQ}5l{jJ{TdmMf& z8!ND_%@r>6mDcF0)4~qLro1Yy3uw_vFWz!}D-PbJ``^8M6DTmjXXe5a4J@a!W7=zk z@q}jYXvTyYTy_4#cp>Q=;Hq$WenUVLb5Taa-~I|jYPY{h^J+!`9r{n-R_@TiXJ77} z&*7wntzjktUZFhr8D5CluHy(8%D&CW=;;C}O&!+38ae1=f4oPQV8nZK)wZ-6AHcV4 zmT!Hbi3D74?aQNVqmk2j%ln4H=0HYWNRj{bGG%hBx7D`mG+1Wt*G0o2Mp*aZ>G6~= zTi~_RL{lAqAwHcw6t`?xi{*Hg*=NJ6acu72g~@Jh+0(`05^bjY{ z*4HLbNyT@2SxgAlv3tM2sw^3d?LO{ekmdtZ_@@jm^Y~Kx5(l}$C>cO4MW8Z}$pwYezvLT$w&;)U1$q!dIi3K# zsN%B|;*kK_zKp*tk&Sff+s*wu+EA>=-3+~pkzhJzeOpv!IN+~h`Co4EIN0WSuuU!E zSl1sDm#YV~n{^j%ZBE3xgB~=iQ*mJ1Y^&d|p|>diSe4e@u2gKJD?pek4T5j%tL14# z1+WQXV@zwv!hxz(7;xh=ko}$eUBw0$wqvnR7U5Ndanp$`2{mT?Ny-d(A+hl@XJeXCx5gmkPI{BKqm&Yk=G#!66n8e|v@4L|E z=VtJ|%1^&Tbd7-Yij%%u(!)sl!ANfn=~Ey%-z0yQG zmh5$reFO3xjLc`r+lzTE~_v~rGo3;2Sh$}HX=xI%$< 
z#2Gm&8K=Sf=7Sp&Gs7r)dY8n7;@i;X@X9IQZLdJ#t1W#u;&pNAub*^$PJ3|_M}?VK zS`gBpJ@Mz{laoN&tL%QB!x^-VCV;7h*%_anwkqO_Iff+?lZ*Ga<)C)kdrSUBVJJlC z^?@BW1>kncb)^fouGmPhVAoi)32x|jTix{YF`_A$IzN)%gJ|E}i`Y}y0TQ<#RyXTZ zfgR3|iHQhNEcTeMnxRw)9=iDM;qAfGh&^#|OiRfVjdZA2C8_$OR!0U^^dkbvJBQ1} zWgmrnMs%xFUYdC4c=IL$cT4E6>$#ifP(7Gy&(!H?2}P};*9X3O6@z_#O`_Yh%aM>d zXDij$hx>%b|580t#LU42A*$bm^5A69Kkh!R;ZJ&V-zgZ)dMNhvgIKWrdi%@bg%IE~ zqDOx--ybpLj-`~!`yl4QwUr`{V({e6vXQ{C4ixv_j)JDnMX-4DICGln7%Dt_*dXP< zJaD2YQljUnBzE)6XR48}KxHynNein5=)0ok08^MQj$xQJO}Ihgh!Q5({cU|64_s^ViIl8L1k z=1J^917i}m&8lf0zI}VfAn$Z1vf%WFBmYHG{d=!A8rLi#7o(K>!}%;wPcT~QvLkK^Igo}(`yT^q0yTrb$j*M%{&b!BmC za(J`ijuS~uIcQtqiJJwKJW$*nbdE+h6VPWqy|;A21%722n$rUM_}=01Bt!dpaPZ^4 z3x^F00QVnf+Y=kji1*##s4Kk&)IH75Ep-Q@(o~A7oP`pcMGMti%qF}Fjz8s?Fn_K0C&}DE~IIN zV<%3#@4GnFIU7Y=~Qiv#9^@}>BQfc)^OtcakaQxaxv!0V?|U|PZf*w!e#IPo_YG`ANNm^=;uFWuX{+IC(-vNzYwJ|9uUKAcaZ z3aq)|uJPtGoof6TuqLX21K5(|9TnKr+V(TqYVTN<5$E^---dun+0UjeNSWM zKh^{R#USYwCw;f1hmrJyk=`28r$BPPN&YO!MI?D$Bu9qi8<4wOat}uC6v+N8+3Oe-rWKM)R58&3(*r0d{7!*lo8-zwtlpR<<| z{I)|LfzkPAE_p!2%53-bD5=m?!j%~v%gKD1i$EbONAJwzP@Py*j`hu_M(6a#WbCSwf2 zc_g(Ob=k+_G}PR1x+%Dq1usqdZyLAMfOGZlc$BIBX$IPrU#0fcci7ja?9&=KC|ZDp zT{JWoopd-i6LH8FJe+cEpmO)Iz^a+TgKO^ycl_(pp71G(+0Io=<>yhvv)P7b>OK$) z0kv1<2RvZ56W#rNe}mz!GtJi=srNRAr0&zf4SsNNS2o&qLm69b-~F4?F zKf-%F5lzsdnhE^fGSxe8>3CHF_kg*jDWg5GO7)hc`7`FU8| zxbse;wLkPCj)?B+KLS6b(AMisgn)kPze3(J_4k*1?i1fud#L7cyYK#k%iwL(WbDcb zYuNRWUyy0Q5-xvB)7TPn8MxfurRSaM0`IPE-xK)S4{-VQhW1}}Lo%UZ%4f0?~n}{PeG8e)}Uh{?<&ZeJp^MuwmIN@(3)> zbr6k!zJR_bfYy4qFWR!tmPt6y4{6^#zT#c*gJ5XtRDQZg5|8_Tzm;zNfUw?d#K-=T z4GOXwJcA@~lh+@|_(1w&kET7Rrl!K<&@snz9A0 zTeabE?Yq&4Ikj)FL;07?64M*-a+{CvzkD`$>&mrr)ZC!juXX+R$Jb!^c-gu$^_*?3 zt$k0~Z-?(1mA~Bk?Jc-}v2@$;mT;`LVXw#*5`qhgerLFi^n&tNXZHN1bH@8i`k5jv z&VkV`-fmrx55&(@%u9y-C1g_>9L0MjV2SeNleRCMc+so&eZVd+v~=HC#No;{wD@VR zmG7Dm+}@n?BrM#Fx*w57ya(iPmvIG zRl{+^MnCr7h+R z)<8L-#k~QT=Kafar+XtYi%*-tx08<0b7PYGE7K;pkQMKfr>U_?mC# z85u&52@7ud=YsyDPudUj@Ztr&PZ<%Lj3GRHyO(LY1laYj8U9FD#beAd-E2M)s9CYw 
zI<_Seyk%=N2vz+b1<9}GcN!jzaBr;^|M;dbkX^$lD_$k^&0pJC)#sFMDfwbaQI z3ko$N{v-8hPj+*0yj{~m4d(5!fU}Szp3`W++g!w0k5rQMWBF1`q2SKFDjv zpMIK~jq+0whq_-jTek{+E+C#eZSM{zFQ=+=SNDSaUeV5%v9%~yygbS=w1LVu;)r)m zV#k{PiKb0cGlZ}As8;QyAe?wt$T2J$186y(#}hP02q$xk>{!u6zIt{Y)q4WbJGIU; zJbu3^%P_mTgb;^!Zkj52?J|XOmix|AZv6*Rv!>&GUL*satmMP)Pf8G*%oW}Q(`$(I zij%%u(!)sl!ANfn=~Ey%-z0yQGmh5$reFO3xjLc`r z+HH|KMn0O;S6{ zfK?WHzU|D3fX4G^P*0;n^~_bEb-erI*lrh`5}f!{dipqiUU>NY=cY#};~)`V{o)Bq zXx#Muw_*nN~)y51ns}^+#vt z1os~IDF#bH=eKrbl%dv9>!hW6R(wL4)`sezCI(mx`lw!1N`dP9KVKh4YxkDT%>&W; zXy3z#uo&>&KM%U}1%X=koiXmuebHqd9~RJX4s~8<7C32h1ypX1Iu*Imh9vclt_U1k z1UvW%*4J7i2)*oOH#nLH-YMLp>WFjL<~YiyaXc-$ z76H~Wh2?LR7a~SFk*Z7TPtb*j2lMs_&^*N7c)ji=ccG(wH1B^_UjRy(CL4m~7BG9z zFLH2u3YuH%z9Tz#A8E{YU+)X>hd!0D-5$r9fG6#v7Dw|hX!WAu1-gPb_^@`YOZY%5 z*r zYRwUR@m@vt&h9#3C@*I_we2c6^2GDEe)j{E)4rQeEk+$ic5z(Z&nu7Pzi(!a*-C>g z*X&;ua|cj&%~LGVRG%*65Zlb1s$w*B|8jnnLKz}5`6qr1nZS{5)1;3S325$Ma)8Q_ zfvj|Y?y<7#!QeKgZOyahcpL=Wqwc}*YovLWx>JCUMNVh^>(91%%M|6BzDH?72J8R& z27S+lg#NvQohkXr|CDYN*;uY^5~t<{vQFVA{@rJ9**)(25iy4Jc0TXa?Z1V!>n>bo z@`%F6Y%e{xm&2PnR~DRC~t6<2pI|9$iAFs`FZP1w1a3r`+U_A9;{ zj$CI3G+*oH19tsi>jT&qXRi6AE*YC+8Xv|@58fSsu{`(Oel~{zw=*|)B-@05;Y+-m zn|~ytG?T5_o$d0tr&P$xYJ>}Z9k9!$(DLEaJ;cn@ee0C}oUIon%rlT?q2Mc*J(0lD zr)0sc_7YH(DHmEWx59^l-nRVuE)LyjHQ}pB6}(?($!UKwD-5Gpu?N$q0MaW?`ff=N zBk2bty)~pyf#iIX{8^HVNb6ko{S**G2XX$agR@pCxkx@;*zR zvnTJ;bM2R_gg3++%U?Z+gD<5OHZ@ni25NR1e0o~x&|!I2sfaEYdHhZltMj@5@&_KP zZ47H-zpM2+4`$^sjjDKT;emU^kqZUavc%)Suzac6DEm3|p_0>2)yo}Qqoo2fF%ukF zeC@pL!YbvXQrorE0(+FLFRLkeBO2&_e8(pl?2o6yBCIPs&fv?JluxZP$U>t=?fxCw z5n!dOi{Z`Alc2&mHRh-I2z_*~aKzYiVDU#^&s`ti3Fndn7iBO0A{>j^ zCh0lj(U+Nqd=Z^+knMa$ppI7$HQveE@jl!NhJVa8QNK*hJ3rcOekY&~`E<)T-zak7 z{va7{Nhk|clbM`iE@q*4^*D<>kzBONwmfn@Cj_uw%Pr6uk;VCLoqrGgd`Im2v*P2k zF+uT35p`1?{!Nj(tKvL)Xq*u5;d2Nqc7vgt(~!QLAFQiu*H>^j4c*n{+S*0@;83^g zWae(_j=nk2GWt&r@@y=k>As?hl^!N%eOfZZBRj;+!&H*NU@zyf`1@A)EhRcGM&cNL z7=O$|IJXp)@)TxlFLuUjZ3)|}!fcVTV)1NzxH!@V{C7MT#qjDho!Z1#5m@8jIQ=JP 
zH=KW36!py{8bNc7vPXYq!R-#7NV=^$xH{H)^buScByHh&y_8x^_DK*H;a2tMBe5zWo(+4_n z!lPaJfsjTk|9(M%EKF7O@4;+-@cQE;x(`KlA;nJVNV!KMF#X;5^c9^7{F@nj)F#Rj z78pfo3A2?0g^W*(KJuqwgy1cj+nGqgq~S;n9#`gQpZ%R~6* z+?PsKZ3wTIG>OM0>%uTrg<7K%7XWjUfaequhW5=U4z52skA^uF+=rrPs2uwMwr|(P zael5~PKC}Q;j#28aK?xmHidKl$o(sc>lB~(mr5~XzV~8A=RPW6^P_#gh+li~k=IWm zD+i2VlaFl)b$1JM(s_fadocL=Fi*|Dox(7Ui|EGhWza&|u|7rZb!ERaX8YGSNSqmK zec-tazE9m7E2ZYMjv2B{|8fI~9d~MUCYHeW?@dwv&RO2qITrv3z`+IUtle-DY~k6n zAYA5whtC+Ym-K~z-c7YeKLg9So8!6Ns zIdS}YiY;>5iaAEI1aMSe?YDdbRXF1Mn^nT+Al~*#{D#?!A~0^-cb#wE4;-0bOT`zJ za2@<*Hx{gtX|k8* z_-2@!Taqg9`6Iz1$tY$zz=U54Xelm_odYbX8E^C&{83=0;_bI(semy}8NTop#ijZp zYhHQsP@DP1<5N%h@M2}nywIm9LaN_mQU9GO(2!K?pc4xIKd(6HyCpr0q#um*){s60 zlJiaSXGt#N|M9y1KSze-8<4wOat}uC6v+N8+3O)y^A~u$KuM2IRnDdbK%&CE?n{j&h!SMKZNlsX$|qOF_m8nccE&}Q23{Jt7Ev4V z=9D5{DDCpL?2QAXuA3ZE?Q&6kG4wEfAOzcgg$rN3$prJ_-fY!aHO9Up$w@YX>7a4y zAxHFqtLU20Qx3*uRV@ARwtDmBqxkI^`-XreUHmz4Yu?EL6Cl18$2`>rkx)HZ89>`ng z>l!8q;gG9uI=kX^;QZV9FP?j9!GcC@M4&o>^KWu?O~~s&1O5{$KVLTk!%DHJMj$sIn~=Q!b&Px^*VN4DVOCq=$%OBW)(>tUx8 zlp=whiEu*v;zxot`i;j}mjHZ3SK`;R!iE=l8&6#2i2^KM)jC~bA?Wl{ZivXzVKk?u z!?Vu!jNlqynYj=;N6c<9cyOQl5v5OvxvaTxf#9?ctqv-@fd6`KcPq6|$FkD5GhWT7 zW2Yt8uR1w8ur2)eV-0B)EV?>>ZHs6iY6=qRDydR}w%2di>wK}s{9USZyYI!JeE;i3 z&uULX|4uhvL2q;H>D+I&D18&%SwA8*+-+Ma_`fwjY;HMQWpPyP;73@ntG! zJd^DaHMiB?SF)PPycYvu-I5x@4JAa04;3UUp>bvH)AJ{UaEjw%dt8qi)VitasDgy? z!&RNNm|Hf;&VMR^HGmf!5>r2vR;&khw`ab(mdA*%v809;eP_Yr;a?wXUE+jyhs1Z* z7<{L=WG-!Uk_ktd`H-n_HW=*V9=aW=7mw_|Jh*(^#TAIx>sQ-SvcR1$YG1S#N>S^N z$=Mwv50F6Le@j|}79g{m`F}fwaDU1B=_;j2spvJQBg_T4JH9jg>l=jcfQrgBndtVk z)aET+1L$nUjsN8aoaOP{4c|I|qpD5e^7{hhR-9aU*yl1X6AXxsZEFM@wA@>FIPJue zGd#t?$A#dTIm1&eN1RY{&(Fxs%YkVA-eX6D022{ifOP}s4N9jgV|zy`1v#2f<~x-%)?cA8hrxy72Xn5@7ZjQrc5! 
z4vyzjHg(TBqW1956D}J(*w37yIaxsj>X+>>yDp~&9V6t2u7unKiyRMhRJVD9Zwjf& zdHXXF|3UYP#NjN|d+y-Zov$V-Ute6h-7PmyTv9gUwq+e8NUu2QyCpr0q#um*){s60 zlJiaSXGtz1$?GCHG9=%C+})CUFmk7`>Hqg<$zB)PHz41^$b6Q}4aoZ}dCm&)n>`s1 zzXoqi7R~&9)eBxR$drd>Dd5d7e$P6b*o3tyt_n-a&gf{X&)!vm$= zR`T9g0moa;A9oMCgBrGXx8@@9fs0VIzu?0w$oDwyOWO8z%6Y-H)!M5IL^e@ylRf7( zWg*$N^-IhKrHpyCnW@$h`P}hKYwHUDe?L5qcdNArn=H~SqDD?3yUemtnWYoR^Ubj* z%|+?pUDvHfN$EU9$2DEa9x)Aq1+r)qzC1#Y0=$1}>`4T~;fSf=6U4X{a`lo{q=s`a`QtZ!St4XJF$YMU#-WoJZ(pO zg(1)Lq+P(Uq99+?o5OG*NZ`UsZWQ`VKa%L2T!H>R`&D0);RFw{!}hP@HJ~Zs?!%7a z_o$l1X}*g-5Q^VeR)}K12i{z9In?{QGE3LBVBkKHY>)S-ct*eO3Ib$t(q5 z>?!idxF(FF3!YmKZ!A;p_Rml7Z;n8NnOo95o`$1=tZygka0042R%WCZnvQm=CmBr6_wWoWBFhil_Nv#`KoN8@(rNny7Z%e zyW5vNA2(MDe?%(rFBAXm6pl18EdJ}yo;KoDWeJ=?@8lK7|Md+D8Mtr$dk1S#o$e?r z`he8GbaDU74Wz$!%7{?!vlVkPQi=P9kiIQNapjah&i0{sML%bQ3=QA%u>Vm5yK2?l zZ+VMg9aRG>k(X>Za%qJovUZSov48vBKd15#Pm_-Fu(mDmf4cNKM8pt_>pbHW*4~RF zJG^`KOJ7hjp~SE9R*(onfs`}}$oX|TUpSCf$B1clWk zWcNaGBs$OaQ{b3|A6U&3yD_VFjQV$G%E8z3^e1&t%})A0&dxBTI30AfL)Z_fo&Eaw z&8F?R&@#BI=42m5DlfX0tE`vOCzu;5d|`|tdxYIMW3rTzGIxJfNy`i_w$$b@yII2G z=C@I?J~H^TdF97HE~ntkFMTndX%(x_pszsqTehra5GjfdNTweYqW9l;|g*4B;7 zX~$M9ZusPd<{SfLI$Az)s&O+s%pk?z@;e&nr^Y@`kA27mQndTMSe5kQ;>w;GH>nMZn8T&wk3=aL zG7ajnZ#RXH?Y=)aDdrEtZ;NRcCp#gUJmE5FX;z%5%Gf#Zh!2*AZHt<`KTgmMwXTh% zTm~ARN|ebnUTEW<@*XH6g?Ba&s~_Z%f#Wy%kE|~W!&CM9uO49b0P`nmFK_U=BF8>v z4&f)}$oX??vyRj_#rchbe#IpYe4#4w6cO^BQ1EH!6E9vSXdjo#RPYC)Uv&0$7i|rZ zeh3S*s&^O?dA0W|YI8?>`Yyf5*_j9;1FjH4&#nR;N?Cdz{Uu zIR2fY6$2Zr&w@KcC6O=a5qz?e#rKh3J7}9C&sDVZ-Vn-cEbsH=-3 zhJ!IpT5KJqXB&OW)*ZtXqaQAu$Ndz6$kn#NN6C_i;SrB~Vud1@EwuaDx=#pbO|DPD zQYWGys7v_Dg-A*_^NZ)sO4OfwZSe4>*hY#d?Skt^J260d#Yx{S>0u=OV5GN(^eK>> zZ<0StauG>h7s-(!`3B_fmfVApI|Z^oOZK|Rz5)3TM&`3*Zb05=$#eF4+RQ{5R}T@d zHWBwQsFI+`ex#bFsQ~ggY0?k7i30hEHK7q%K2-jsJ}uhE0|aZO+=@8w z8DMU3#P5EH9#C)+iQZYQidKn-ubb}76VZ8ktrCY`5``N-svbpnfVX0aHAm^rfGHvR zX}=;P)Y9#%ghQ#GOf=!E@vD*$wT}iy#)z`O=aJHrx*N%atnSNc+fbkTy2+W(gBE@K zTul?txecWJwkH0{_EXmHZZ+0n?jqcbzaG%zAEOkie7VxTYJ%4CU2gPb{B5c03;Sy7 
zponxK_oM5MvcNa?&WWEei@2VXELnCxiqc1zKNjutC9*ZC{&mYzqGsh?h5DE%kQw~8 za#n~2J*rEmK7y>ZoQa>w7%-&;w`t84HZ^=z_Z9zHbvxP{S+ylAMW>=upB0|Ce9v2U7m5K;f>gF%N9DnEVmmkD}L zu^d_YY-_ySlD1eDEVIe`thIx7fnvnqLw~L~fC-(s_=u z$#&(&WPCr7{(0$Bz0B_xp>1cQ|7OY{)H5>Psj&wLDm)vq4{E2lxv?zl&}4F}`78N@nnyU)WrTUZ5h2^y*>P`WxoYoj=b8tm6Y)NMZt>Td}+qVnUz5y2aU{@rL z&Z0GcOX-$&>dxg5^S|5xi>#+p3_D%WrssS9cg}VlJpSwJYAt0je>A${4-dGYT}j@mjX*geKjXHqsQlMDTPCuK=DIk*ApXrV9lIS=yx|KY7{AtINZ@n>3{la z+Hs+dh&s?ueZUx^c&<3-YkpKkYP{^ZclNQNFyfX<^!pv)(nWUL$GH;dL`+pqa7Q%7 zf*X`Gf3+pX1;e>zZ+cPkbS*+?b*d>H5pU}Qc^<_6?_mON*#aqW(Z z3#unl_z$ugHf%y{1zKqb;)m2(b(c-n8@W*P!tyY8S`T4CxULH(j1hb6mnBZENFi|+ ziU~jYHr`NBCTRdHT7+F<#&_wjv zw69M%Kme!d^Tb#8D0!RzMts_zO+0aUdbO(lHRV@^^dZ|;KBV`$j#u=J5Xj2?$XB3w z0L01;>)hGOibOfK{+4*|M_gbMYc$HTAmqHQTVLj!Bjk+yjCR)EA=q3)kg4HJi}0my zt0z@g)r}*c{`hFP(Q+~3{jmG%Wpxi-UGDRTSJbN}v~x5O?H(cYHSA{4>9My2ZQ4+zetSD*HQiM0?b2$?GmA?HhOKuanr9Dh z&HX;#;;!t>)miA!ED0yFk+WIA?`2jU?KTZy|kNOh(SUaSVr^y2->%% z{(Aa(?@8gG>P??QKkkt>LwQzv4^`x!Lrtf%xM_W;93GzZq!kGRH2(fuE?v)W^-9}B z9>+`y<*nva%Dz4(5V^}LR>4~dl{sGZD6xJ=@mhNjVJWK##vPt-Tm#1uhrw{E*`ry? 
zz~BCR)w+5>tl&1Uj$9QLwJze@5|Kw_e>|n881q{_{ciK!B(utv&{Mbn9`+GJ8^Up7 zf>(MdZ$Aus*=9c=&Rp)|>j`2*+hMH3-T_fy-S<6IgJTPjEb?~z9ZTi-J_yKGVk;(Q zw12haSUOOi)2`DbcL*W1z>YnXhc(V^Y<4kj*trP=F`}B!_9>(nZ0*U|p zV3H-@7%x-3HO32#+y40!?#NoJ{^NWdVrbt>Ngu9@HX^6S7lg`-l9el^n;g>Rn&v?Ij;%K16~>VcD_+vq*VH7_-B|tB}xkt zCEnS-qdZ8HP15`Dim3ZpGQe0b0Y(|Kq})xVf#(j|w|glEz}WIYGew#itZ6GMahX3M z+(s;IkLQ+9I^!%jdWS6t$+DE!>nbILZi6%TD%C&zVU6W)V=F!WbXDw2%cUKd|9odY zU7!#i;Sy^}t)<0j3qo?8a}5M>kBL#wnj5Gyx@KHBG)Uw!E~GB0tq_*)nhkp!A)=u| zb!7b(2sQ2Xm1UV_qE7VP`)&48=r~2e{Ef*^BBAa3%LHQ~u!|#Fud|!#^KN%lDITB@ zMUlE*{Mro^g``zz@aaFw%|8drUsT9~`yWOQBm{9GH@=`P#q49mOrF|iL+W=xMz@{( z*U#gW$Jq>Q6DsWJ2hV=yX|@%0?L&;6mJ+Yj^Z(@Er>PAjl-@WT^N@W=srvo=r>nUi zQFczA*3aw?@pi{@1aVdl`|7w~P14fDS2v$$6LeC+Q>XuyES+M<;j`x?uC%UE^zYPW zXJ6feKD$NEevlj>>?OTjK2>bP_Q%(MUHY{L&E&k)SNl9lNE`e(({9g%FC6Fac$}_* zcqT3O2}zDp?p|j5uJ-t^dZZjde^=pFi@s6Q7sa0g#IfWdPaBs$qW#OyJ7e_Yl%QL= z+k@%F(1CtB?G|lX5I!37`ik5HG4gdau<=wIVeiFi9u)eSf~pL7cv_cRJXddVnix&0 z2QZfhvESwdprf$Y$nG9xMXNmi2LmR8c8U(Fl{F9=(JxJ>i(4rNEZA=R^W7$t_j2E% zdKd}&!L$GTV1@p{4*$G0PEEZn|9lD$q>e=W<9y4^K5h8NpWSs+^XJ-YSK+XCGn)Ms>y^MukrMCj$=IV`-rKfQ zdtLBTPSROw-vEQ)1eIgI%n>C+%bl9fLWYATH1uJ7K-o{j?zuWUsD5F4^4~ej2etmw zxX?*xi|o9y<99EmnC>L?{{}Bgxu)|_S(_Po2Wa}%*nKCQF87GuZJQ(t&cu(gi_TDN zl!o>ja;>-U78C=^Sbn5*UDCEe_b>5|+x){pxj|w{QI;mlZ-R2BE_g;Gg%7E7$Ft!( zuhp;nXhNMt0Tgd`Ek9?#B6C`1uu zXRkygtBmif_wRQ;|3J4}x43$q>v}xy_c_wxjCUbt962CKsayDQ$0%~?oL^Jj;w!B9 zo0Ij5L_HGxrC{=^Uk&D)+8E#HGF;cKIoGA{$OF_!)P=I6n{`SVmQkVhd!e!S*7=jq zs*q(l(-a=xi%3?{ekYl)vY0@UZ2!aHi2r%T3EwT@VI=%ugtvz9DG;3Rq5sdHCAf$L zuZ!Tw5PSoocT4oah@JwGpCxi#MBae7gAw~#VsAjaXNfsm`jIKTH>M4_rQjnQOxKQC zaVrRv#f!prbvt7k^baZHg2T(t?SGM|OY>@+j~9@WtPc&j-^^hhl_6%~JD=;!HlIgO za}SUL?as-{CQ+lBONL@MOn)F5&&4nFuuWiM)m%qaZ*SIB4A>2ew=dLD=KSrTiJ*dc z^oI=Gf0tv2k#Hm5bwE@UutxzUA?qp<2-^jhyDiO;a z&Uu{ngB#BfmuQT$k;Zr1ehb$3kizBWehSc~u4CtJzKHDJHGud9Of=CkFaeD)X)h*? 
zr`Ypvk**JYHxZlYfa1K|BXB)-koL_=16EtsUoe!vf@Ds&glyR#hAmUR&hiqSNP_jJ zv&}iHq>qVHt`9D~sbgV8n!CAc5vuqUhW@|hh`YE+PpH8=EM&*oK%GtmFo}KOA7;i*N){?oVpUS8_M=5vHOI5mIQ)uwS-%&E+feh*Mg*$>XvJ@se# za%=k$a@gdD%we(@?bpKFox0PxSWVWavHiaak-xTPK}N1ZcuZ~zqqo~(-1m6$d0HPy ze4p2%4b|2LwkB1w+TA{b%|uGisLZg!Uu@USdpYY6`xP1cOIN9IP<7UXRgWJ$bEE2R zgjL9-<;KEno~Dqe1j|=5t7)kX9cCd^uHH+iUVHH z(IIF$i6QOOUfL2`;(99_tzju;=(|wzLik2Mk+y>{jB+I+lqR?29^tm!@cP`gj zU`Z}!&MmP}NL`3*TG;LA`w7BU4TZ4m4i2E^RG{KM&kB7uJUV!6KVyY2ZcHh2Ok)%$ z&uA3&>^lphh7|~N8^0P#)i^v-gcQ9faXA4y8`nqRlEGg+*^SKmxwGo|B?8V%{tx=GD8N*aFnB4LXq@{ABj zD(&6+#rOxK$yUk}YhOTAAEvV3P@q7mIQsiK%opoYUuZvYw3Y?T)z40w^~-@EJ=xX* zqc>P%mrb5TayznVDtg{Ry$9R2WonY0<^*GonVLCf{LtxF!u@w~l5p|9 z!NyvdWJ~Ca6quB91Vr-vIeL(y^Iu?AU7{VUdH($xW0fw?v7Za>!EaDLV9848e73etDi?NhOQk5eqsm&p5_u&a{8+R7-0XPcc+l|H&jt}K1F)kR6J zoK@-)tB9l8o>%Q;&c7MIh}yNfG7|DOllfKsEGC7SbU9S8qY;HwiSv$9@Q!yOuK#Km zKGQ(UXc)2&t>2R-BsD1yr5^Y!ZGWQ1RgdZ=9)CnZ?xj3D`t8aZNw+w~WLu*dz&s2;doLdtvBB8>tcVOWr-oH#*^n7OMYMg_)iQC(DmVC$T`Kb05oR~!D zM~$zyto^gIh8vBj`I$bFAhzHDEZ zQy^;qW%uw_evE`u>B#YudmP}V+Wt1)m(g%h_|pfi@DO-~@)`M)D-9I8o!?<|==i^^(Kl)UB2gbz&nY4`;l;E_Q8A7qXUx+;D~$m;xK zs6b0KkUoA57*>3WA@{-1m%qDurdS_>)S_=#}x8cje?o+w>r;%`@4t1e9oCk!3uUb zb}g$wcGwl@@45f+VtqNXD)3gqu^=8xitO6hSI~%+MC~(snyv(k_V_H5oM=EnOK*DZ z6>g|?hpDNbQ4CxxU(u@^$VT+?t{g>EJFr)@qcQu*o~(fbah6eOZm`^^;!6F963m~p z&^yhq0jPc2+?n1FrueG{iXofp8^#Z#XS`tC!$R@NiFIunu2 zDgq3GOtahG?~rcOS^wot76W$icEO{)!XWb$74Da@h&_l+UAq}MiFE4nQ=*(Z*j;Bj zz8QHP*r9EmCg?j#^00E~h{)7{oYl(8&Gg0~tYYu6g4#0VPWEy{*RC9FlV6Rd)GryK z(28MC(`!N!!c!ip8V-`YZ`_n+{4EKydyTUzjRc`*PRp}rp`74t&pm}_ob2Fo!{zgB z-B-XT&Z1*;PVqo*{lUqovIJ_+bIk8C0_9|s`~ zJ~fSFR&aNAQh4^qp9poy`-4^fp0M&yps0(42|Qb6C$Rm79@Tp{ZxP{n7AC7#MC5n9 z#u7Pv`7FwEkS+?{rRAq0;I*cku3EJ+n6oUFdr74Zn6A>C@u?Zds#>>5bL0&2SZ_Ri zTqQZ@K9;x;jt!Yl3E&O z$FOO4MYi?PUgSHKNzFdSJmfid)~xFQKVUI)FIc=S58~EGB`FH^KvnEBADbjo@C2u5 zQQac{ytLTKwcgYOj^wZU9^3#Nd0_JGb@2%xSmJnyrQRJnEU#X3rH_ClE^DUwkSzEo z04M$Aj{!QgrT;kJ_(STB0HgUZpxWi>pMUcP=%|}ML*{kiG1f!hevbwNrjN`AL)VR= 
z6-IH2+`GkBkM{Cjc^VC>!&1`%zMX}6Q?#t){4Aal+4O;1R0uF%?f5TmaAm^z$iF+7 zlsV_IW#c|F~qa8}H^nC#^H$0bb>IaaH@Cf}Ph^Yajdi!Opna zIY+)hOi5A4(a5YBF`?^{JS01Uy}Dxe_%9VVcy9IT(D##~z(e<`+SpkIAV59OOBp5% z4A>gf-p?#xbCw~ezlM(>s{YcM0^v0HUzH<$zu#LzRg3e_u4(GQn?a=o7h^Smbmo`* z#wb0gdPCf_FY*-zdmXYR;*yY0Rs%}kWYV$GvySvz#$T}4DfZ2#5yD`6_LpT&5)Z6n z=v25ADFo;JhIy3IC83em`fBbW5$F*P$zQ};;JLudKXAnEe_nC7)&J+aB|MCTAB^zU z5IzNh^G)z)2`(bR>moQZ1mA$@-4cB;qNhOQXNg=FkvAajV8niw*c%YUhNc^|XQf%md`No|VCL?*I~l zjAF+sMZo7&%4eVDR!Qbn^-~4$lF+7cGplb)6g*^;mGg+7!bPgYujC9|`ej2CUaoD2)Zc)#^GM@^7Co;Lm z0^V`qL%FiA?+hw|>{!i$r6ZT%U*2I`IUtFzv`XFFr*#X2l_teHJefvn{4+a$9_Yqi zKH0V2_<#f^xawiuH4*SEq{Ck_?E2H<)_obyVmM(?BPEBjo;c}$mLsLZQ4C3?cU!M+7<%?5A_fHT{{Xz zg^zA&-JHOv&Sk5|ajQVd?qTgT3q6QbInf5BP~q>m-NjhR-!ZgjT<*V(zXCNVcU?)Y z*pDX)vMkq^ssXXsy7WC=ofvBrhiuiUH^@-j(8K2^6@c4j923hW63k%NS!iN3gn6Ls z;p>7W>`(4-v4q%hBy?O;)=ORyn2FwhJHR3dSPamQkGb z4&o>GAtdUoCrxn+*TVjPUh$M=aqEA+TMp|pqyIdNd)H1B|MP=QNtW7?y*2oarTLtv zSFXc~{qJJY1TR!&B+`t`pT$q-#qD=_dK2Eh)krI2l1+6p{O2sB_%#_+d65c+XFDgq)v6%iKDxcgx5D!DN2X7q^^2^4Z5e|L$Nv zNS*io?Pv8ZJ{Y_v?+qkJZM5A)0kGSAGjvba4Q{9UQ;$MfG}+;;Li{&YbkT_S)bpMhC{vzw21RH4Au5X{`P**3%9#;y$>tD+%#_?A&3)Ral&^?co+#k7~!oUdo41WdK4AE z@QPDIoC}RUCrpw2h!d9^+stf!Z3q%?M4$WAr2&L~*WQaMRYMn&LZThWoH1c%n zn#VjlFN<$1*HB)d9YM;qyH|=54FOlHVw8u(Hge;;r2?m?07}(*B4zuBGu$QVxUqPI z3*9kGwf`n2hz0~)VcE@M3U7>1y*yDNjB3OPzm|B}SvUC5cRs=SFt|4|d2)a1EMi=` z`Z41878}{pYla05tHE^qOE@y2!1gY#sZ(L}hLtjN+xvTbJ6+6^jb9l^* z5wGGtaw^735$=>9cNV#(2-V+;eUtks4zv<0F8e1`A*7bPRoaWwSlH_0Pv2M)u+s|+ z9Q~u+h@9AW=5k>(oJq;L?@GNBzC<4+D41lAyPlGjXnvu9cb^QrBFw>#Gw$gv}2ll6eFFcLVW~x)|WL_8kYwXW)N6{k4v-OR-e?=dy zKHIfF%}5pha^8JawlD+E^t0-adkQF%#nOUDhBD|ndZSyyRv9WfGba7Z8zeJWdj7kE zP0aeK_>uRs@!3J)v8^Y_yuX;cPIKzeLE7B>-<&$c27RsH($6PDQ09uRX{F7Ksj-OXyw=`fZT}xvv zK!Nl8xXxD%Svb6!RwsNy6ZVx!INJtyA_cq=D`M`iusZlY)ImK3YyUj|pd#ie!oQGv zAm;W9EZL6YRC$LE8g#!g&eensEjT`x)orSeQkx{oCZs8$gjbyK-4Y%~!VgAxYY3kL z!TBcmvji8B;B^rk8G>&>^lphh7|~N8^0P#)i^v-gcQ9ffvpIO98stjXQml0et8Nhtx-+#jMz9J7P 
zeLlfAG$42?yOjkVhTgSpGmS|~Fpd4N=4+q7*oM!;ALpwVG3ghnX*Gvtuu*!s!a2%u zL}9=h3)^R;Fy{9oYU;O$Mj!i0 z8O|(3KE30|gOinr&ySifnQLX(d0XlclK@Q|+E>a3vRL41HSg|gelx?l`26luT;#!J zyT7s6oTEZrt$xOysZ)loURei>MJBLU4{mYjm1@3-c;oK^snm- z4qPaM_Hzx^*&z5Lq=74}u@@`v<_^xLR0pT{k~zBVWMRn65@q)5ex!qGikB*T9AQrn zUE&=7fmyw~vum#51emcqldOZI&{e7`F!1RFqPY*mnBAO0_^+@uaf)qV+?BP43f!_# z>4qrs{ficJBZY#J5w&>s{2fwzT8D+z}v~Cv&p4*M5q&nU>KBWo2zUz!g3K&4v zxyP6h4Hh_MeJV1p|vXtLa@hOY%s5$oHr=9 zFOm>lM-J(G?aMI#h`s;wIZ>+T5B9Z^`TV*kC;02k7s-=&0G8NOX!-vX1}}9@NO@i5 znB_~fFgWE4GEsT-@m<&F$hWE5q2R@CEY#twq1I70^pL3Uf;c*eFo{n0Nk3*q!CoHA zKS9*!)nJZoJ6nEyF;d2ihwK#>R}VX+MfTmA3a~VNPgF+r8`AiMgKj`)fy;tot6NCU zZb^m=H6?V2;loS$N*?GRn{`hW0;5u$BTb+HpVZa`7YCM#Na__E7dM*IG^ce zi;UZV#h}daKX>P_sB7GTTu=AoyZ(Bo7fg2}`;5Pp4?AmtVtofL&vOW5zI?4onokO{ ziT2fY>%B!RDn(EBW_DtBhnF8IdDLL~6k1eV{Y?l#}O~SH$^A!iiTN7)B504~d+11mrCOI^O(eKtoeDj}T^jG&8`ke~LDsa0HZKGSr z*NXl2j1LX*ox2xK+y7$1m0r5JY8^Dd{fa}&vyQ9crK#$-ZhSIB39mTeyCpn~gddFX z)(}1gg7Z!AX9+GM!RsP8G6dg%=-m>1FrueG@>_^ZqJ!1D+cluDJBXg`l+`ej@Q``w8cNma$O+vOvF zc5ImRvmn+GzOk9x+>CVaTJgR8B8>j&*|m|Ypn~df3YZyLX`ll(pTDwqvf^kH#Yd&y zFWBtLE!s0}(!lIzcbsHa4Z`-5wwBgsH*V+5Ht#%g2xw?s_&&S43Tf)O`c9{Y4zKmp zoB=Z0kYc(?Kj&}?_AUz16MlLWTE%K5)xV;EmkkcaseU5o$C*h2y;0Se?_Fp**42%a z&>nrz%PRppbwB=0Zx;cfAM+ivf@iUFk9EJl$Y?{Hk*wKY`P0a@cu@d?ugrQM>T6APla?tgf7w$Yns;v<;{zd_&w2AxI~zUJ_|$gM z*rB!VkwT@=TaOtmZv=ZCyq6NsYZ3sN->A_24&uE>gAW2%M9QAMfCt(Q(6J^(H(@`r zr7Le8qQjlqqO4fsDbbz&ewW|+{J>)Jg0ZFP_e~A1U#&kXk9N zLQ-r5h5J1wFecq+r*>3_kUTO3EW9=eQa{9bUmI5C)O_HPZUMLBlX(B zq353gp-w-ZHq{DHxyo?=+}AwtdhokL;&cogZDUd(dl=CY$E4%0N|WI5fj$^N7X#V7 zn4Xb+3h3hAr=K5$B*X3_e+{j5jG%RY`h7AN5fxI}tz)H^4v)V|cDr340X^fA&B%NM z^!>{Oy#&o@$k+Z@uIxZSZUr00LIJG-~_h;ftok|L$PM z?{xp$&oT*yu6$(CgV^^)ir=N0F#qIqv9ysM&{5JayS#4~}b zTz7E}=lns=F*-!QE~<9}JcdJi!n`UG!S6RuJI>t2rs1tWn9h4_s7wp>Z#x1@Rd%(o zqZA;i$TzV4CnGEfn;3^Z!eAmlKXF4S8#(RlfKI7=##rb^crVZY#S|rupft~%p_#0T zL(Qrx)1mWDkGHZol3W=>-1Vf3Kx2;|^J_ZO7j94g!Q%obcTe9!A0sMtExop8~=8 
zCit@i7m?s~5gZwUZ$R{Ji9Q(7Qy}uQM6Qd-8xVIeVn0jl4T$$FF=rKcj0C28tg|gllF_@^u!|Mc-jdZ;<T06J#;I`~>?_ z%T_r2G6gYSmyG&0`5svhNjiNtUYOjctLaU0lZMe&#oEzHr;Z(8pVrI<#MrF2+Ak z?q9S3^xF?=m?~x9y#S3-Q=TOx^2ndRr-R9Tq6w|-_qNi&`h)g%#>{S1p{dyCk`4Jz z!H|(~k?su^eYfao&$lN?x1D79xl?i=$-ihz^ePFI2S+^>)FOfRJXxP^TTNo?Y46uq zZoS2RVjCRdK~C_YqsG#iAWM+=vv%m{wh3sz^&-_$x*n0qG_d-!9g4iDyp+cNF&`TV z{G}&7s0ovAM+c?`=>i=W@z`Jcrb&4gW^qfsvM}$r>hG}{5BMSQ(W^0-Sw3|e=K>4$kP^)B9@QCxeai@j=?oo7% zJu=<~R&3EexK*1B_Hy&xydHWOFRyFY7$Z~W-Y=^(JZ_%HDlF8?PfqqCYNc0VK0hVn zg442U-ioNg^n|vC##jVScI_2)kYK`7gs+R^pvNH~+#}#2=XAZ60 zj0Iy-)%TG`0F1>2CrUp;PE|h0jAQC{_EW` zc?bR12fJ^U`ns!@1IA^9z`b>>(0-SE-?5WRb@%U3|CcvV>#v^scL$@%d|36a`!Kl9 z#IWEWXb*ak!q7oS6R2jX_uriT)_Cj>c99upzmz(gxp1ZayG)sC&N%vmBuBKI2*4g8+?+5NZXJ z1HigUNi9-+6}!|>VBz69g9vABT~PI5#QzjT4mG6M!*eEU^AvGru;lSd8CcW;jhA)& zr7X;$26gA>xRW(lD;Z)M>;4E?etwzd9B&TRU0cV=n>I+!Cj|KTR!M?C0%v#Xf(7AC zq=dh^L;{XpIAfBvM;`KrQ=wDmWMTVDYLAVZkr1Z)?>^3}4(>;4b8N-mf$kA=J;olV zV3e}B3{y%1;Jc!ENw4`C__24H>v`gf|9Qm;-!0)`B>Z56w}$X35S(v#sQv3a>p<^-q3};g7IJ)|ncFr>Q{Pd*3jYFl(fN*%O_k3i}M)+;ZIVR%A5ykz2 z8=qIDK+@MI_sYL>fL7XTd@rxb!^lg6`3xVWfIy}5gL5t7;QQ&(&TJhs_^`iWSl#&+ zpbwpqC5fd2iO4xG_wC!j$2>#DT2YBk^R?;B|!PU(+I>>g>BdmYuI4nde;58DM`G-Fn<-E|pCW~O+=7;8?3Cds zGk(lI;Z8=m17Ow>D{Gbef^B_E_;BZQ17dGcB3<=F3CO4QO)!pY!j0Wy$&`}wR_;^>9rS$Vb&V*V9MR{gsaI!_bLbC_7p5DkMyRJG`<`VkMSL&J zysAhd`yc;qRFi$TXqfH0KYqgpaqZ;(7iQ*fz_$AhjO?vJ*VSFPdjiF9y4A}WSufv! 
zbIjgcpYZ@R;QDYZnTv=n_N{d9Jy{Nlt)vvm92s<%cCoG(`ThWf=C_1TIDi*zmN)+O z!Pq2NC;#;n9@y(M)RObFwSJzmTzAd^{;)@RE}nsKp@q)--yJNThSv7qewKb$u;(Uu zZ!j`3;duV%8NfiFw>ohr0Jb~}(_4M0fljdO=IN2)M`cQDf42Mn#b)~0QY%tS;ojqM z=VP@HD8|EZ7`I>v3LJ3O^3hfA7kfp(Ar|1#Ip`BuruAzQ5*%$|%q~RDPkHmut!W$|-HS zaVvOOmp}b&iz2uczgly4Qy=CmoEzEOqz69?^HJxKc97!Q%iq6R*C0x&k0|X5pCbz* z%rAbiR3Rz~xq%a&*Ad=a*33s&)KP1ZR|@o+bSSf>P-NwbI{JE2bZ%ur7Il+VrhTn^ z92W&wUK+_8;W~TmnS-Qla5n$nmmY2Ep@dhQ@ZAy~M#2w9cxwos0>Swv__G8Tk>GU^ z92tUdK=f{jJ{ZwcAo8>{45 z9nGY}y?L#7D<4=wWUR-k{36Z3h1_E<@|Dh@{Jfx#a_1?~QL}FxHGU1&1aWmFO?wq4)V;~vMorM#@p-4t995lnVMCqV1R2V$~#;&JB}+S-#hYi zj~OmTvskL~k%U(V9g#YJVH-)QtMBaN^ah)C*)?;PH0Wi`-6C=G;^=Jn82`|pqfh}|#yEuxzV-L_U&Z0?}IPaWOY zDM!zS|N7BYph+zXhgP&ns?Ri_WBwPP2uEq~Ql-1gn63%ATjcRB;_MRUoOtlblaWe{ zDSY-&;O-G*C-$k$8p#|_I1X2nJ~-i4+V?ixIc)J9g_IpV6BJ)x;!;g=;K9`zH`TlY zz2M;4fb<^nzgNb%^7GRh3b~X*rLh-uU^@~&>_TE}s>wrBaK0}eOR2Ov- z_TRKIm>IJIO274~>N{D{+s!t!q0JkZpKHmLp%#Sf`?x3H^WfCBy{4%dklPg0Zb z0(uF@Yy&>I(}Jx%Z0;w1)eYaWz-22GS5K4d=_8sIZ-qF!Z*3#P^R_;qyx_v+Oi zGUpr3llsKobwvz!yRqb9db|-Foo6f{^SaPU?W2yn>|MZG3gcoIy$$@%&CXs$?%lRJ zo=Q+A_rV-Ur&C(UJ%!x6%uR^oC15=CU@n(V9a0?b36Le{4N7!y+9L8DjG5sw&Q0FW zstbi)6{EcZZcDzqU+n1vZyjhU{x@d@)AzxSb{MjTD<-h+b3A0ohAGF`Gq;+s4fR zWHpxCKImlwC5NJb2#+CDJotPw{emSt#rQKu{c11rJCXHOcSS$Oth8vu6 zmvR@FTtA0rnM-JO(0SlNfortokDc+jo(E0L+8-d{6(@YRgolyvgAv{u!lyuRz6t&; z!9^r^T?9vl;2RLVTcQs}^c0BvERpLX@&?2mjM&c-djsM<`@cE+G0??IUKEwR!t|mb zc?n??THpC@U=D8p(=p)!GqAAJiW`b^0bh|%*Q&nFVr=|J`Sv}p#e8qmwKP6`jqT&T zYrOC4SM1Ef%Yt`@wPD_5{Id&%2!LYYffe#bAj!vLzR`phU+*myQ2$1QJ6&(BoxAuQ zqwW_N4f3eObnB(B<;*fe#ch*^CAcnVHos3f$sh!fnkOw`P8LvgmX*=0@(ZHsbiX}! 
zwhL(xJfV8!KpC=I*zYnkQjM%T2nbcVreSX~Lmo}>XyJF(=)Q=Co8vWtSDu&tF~XIu zCo$jy(l~?Y`F_JxS~Sa&^QV~CdGOfx+V}O45iDWzLCy1Z0rcUR#+%Dmqd@ebO6GXe zZEWUmT5{5H7PRP?MCOCX8Q`LJyXr%VRYa6!{tAzp85~Kdxvj=52JC}uNIkMch|r!X z+hpo-WGORiUWs`glg^$kcPHn`G?dh~FZht(p`|ye6zzHu*?Y9h#pzQBU;DF(qK;K; zIW{br^w|JZ|8mtjFUkhf&)iP_x$6>tAD~BKSe`=&#ty7t6g` z4J}CUBYgk^oV_;vLH*9R+ z41cPQ&ZW{Ip~gQ8XSSd=&} z_IiZQbCexljBb+au~kBi7AlXh7U-Zi!Wd20viQ-2%Kq~5@`Ly?)%R$Q+aDnIT;N3B z+F@L!(hnr69mIwFZOz2HTH$ZKv0L||<#4bNaO;|r2;L{8CNR6&0@K04hSvjfxRqAa z82vUQ+JDiV*RUGGnG-Be#B2p|EnOA=52sE*j-VV_>ONU$sp%X({*P}U_{IDlxp&JH zSA2|Ze>#$OHTc)t%=1u*A}(xSloiN%J7T|EvjaQ6M6c;b&KqQ|J$zJ2zJoED9XOtP zFc@;ZDu@YxaRvz4IM{q9zh_H6-*)^rXLAb_xy^b@kqagrqqpD4K$=D0wSbfRq1Sw6 z!dl5ONRgM5OJAmlozt^#)?R*(mFPZNZ9eo0OBh`1U0Ha9yt1(1C>S|K_A7OFwmMpa zoo))dM*{U}FRVkGo(#J7!lp*VqP86=OU$7sviXBUR*;x0rt+s5H`;EJlEUK601-{39mTeyCpn~gddFX)(}1gg7Z!A zX9+GM!RsP8G6dg%=-m>1FrueGWx34)+ytXHs@@@SIy21r) zSzD$e#J~V(%H}DKq2{ocENID=GKOJR<|9-$CotE&u{XM%S24@3n3KY)^T@u6>5ahX zb&SiQmD~KY5?p_%Sk0w*1WGFRO58iF2N%-Cwx|p_f#5*nR4`K<#!(czN!8ean6aAn z55IYWh#7J&$|h7}Eg$KuQH=&fRYa@#RRRNvbUta#=c=QfSZ|Ki>Zt(}(!#lUS{xA%-W*;9=oBoYq41dkm zU6!$e%PX7c<7D^%-w}wk-ocLO3do!}sR#T*i?y5wRN>F5`npFf4VdAJym3yeTBNtf z)I`#*4Jqon_V7I0d2n)9^xjjW$Kh1J$3U>gcWit1M@VkyI8t`+hZaCaknZGVe_I+u zC}`(o{Jxw7qSw=(A2~z^9*?n5bhDNs#gW<3!39mo`9Bx$zI=*f%~Xe_DrOom%u+w) zSC$#N+W@`b-r$7NziXbMGdqF4nH8-6#?6IRZ+(e5eVQFL_UDPSVsr+y4=KknZ7Tf9 z5`|?Diwt_RS*F((y#jXBbc2-=d2z}9a~9f%h0%{sPx>D9jR%ba_m`XX*zolE6G+*J zAux}TnRzRw3d`Td>M`q$V{eX5qeaCH{IY7#jRb7Qp(V6PR)Ni+zTp|tpDk@ zu!Erw1=yr?E&;<2mk-l&L;;r}G18{}Wgsj5e$>tJEVPYNxoCZg5x>YRJtZse2cN-- z0_k1k|A$W1{pS@I`|^#M;iETX)Z1O;e(M}O^@#gFKUfbua*geqH#|45y7E=m30kJV z{*Uv$T6=sjaK;*bS*&?Z<|1NEk2vDVye=f^nO(4d%VHhs>v-u@3^mlCLOXl=i%Hk1 z6xzvsFbs1H`PWlGa#}TZHo5pfOo9l5Y8*4vI?+UuvtfoOW#o2~?_ikC-z^LBeim8e ze-<@RpbMl9Uo8uHuLfbIOjBN;B;4QH*vG0Ii>*!Rq@8@_j@)ppXj|X9k9@r(6m>LX zo3zM!+d8tQmDH=vvNS6;P^YlCU6Z!wZ`}lqkzkKy7`Am58~*8+i!7Byo$JhRL#p*t z1!WQxfUoCkd4BRj$NxkBg2v22s7}M}49_#c;oJHgQJ?Iwa0B!*8`lM7tEDBm)YujA 
z-I>($Qhb3>*?62gabT$~LicIOSQG__*xWd>#NJ2RaJ#tT&%6t4>h*ZMiKc|z4bmN{ z&x4^w_bDmGi{??-7WrzuXV_Je9S-*44M#RK!xHq}loX@F+vGIhdKGH}1a zdnIbt2gC|hQhhU2McZfHm1tWo09h{0-*PtI|MQ9yzFWe>Nch1BZw=v7AUNLyf0p1P z61*;gBSY{Fh~6#H2P1k4M1GdYbrE?3;todaXNkQ5@t!5-?5}h)wym8~Btr6|sQUMojm=KPjfs0~x>pFaY2PFr1d{X=Totg(7Yhp*`Yd{^P(P3_O#%J$dWJh-DOg?$PL~% z?dx^<`62aL5p8wF;n7tTdGerbM9+Qhpczy#ky7{mY6b4fxxTFCGlC_JuUCI3uGX!% z74MGgF2HWu$T{ON2ceh&w}f#XEj)48QqWwy3G+G-aO&`bgWxX1neI-OgXHh0UruW; zhcTVjKke+03eIcLJAD{VMj~z=psSp|gjH&ovjN@xFrdgoN8$tx)YwhqG^PE$?zZ&) z)=xDx2-MApFui>8#99)L+-FOFOY2mC zIXqwdcH%oXd~KkqFg-G& zPkko>=3>>f_41Cwb-{f#-X@iZ#a`Ac0V+k<;MOQdsq1Cz)uO{9mDNg}TAQu^gK-w< z;1E@utji7U<<#a3D*0fPpgPI(=iwhnAV%)G`dC$X38{1>g2N(A>~W^*ebEAEgjK zoU}EJXf}>=v!QpOh^SG_%+5L3l|y6w&%?Msw^F5VEE+b#v^Ab#S7<2Q`kzlBMr=>& zKEJat@zmoQGJh74$zuA)Ma1rI&vULz%EML)Rj@xq9EK>#Swd|pAeGZ6PVU_zt?2wo zavu!Sb71bg?q>#XR{8KL;ARk*acLNz)c{u+b)Cq00|Z=NnA_nc7$+vi8p|IOKqYnc_+bi=Rn&@M^?w`i=OIJfp&92?}_n>}pvlM=?> z4Y$k+WFh%7 z6t9f0)QJw=Jgl9w2R4^(CofOV)kWvD9cZ$lg@4<|kJc+wfNRfXCkj|1!N9dexCHHh z(w>V{$69Xzol=}cwR{l-C3N0Xpn3pDG;_c1amj-l9_pIH_KzXO^QXTa(WS!BsEn72 z^P;$BSZ9jQm@^1GpIHB%>@jaR(~OLUisFPBt?l$YSqMhfPb=}t-EOnFu5uqrL<%k$(tdsiR zTEPn0(>^_W)+8FKv|p=O6I`nMp5VeiWW7r2Jrmll`xhs9HiVPSOXDQfN80_jvWH2> z#BTp~n!gCR!kjC&s3O4m={!~g#2Fk>{c!mw*}LT2Sgn1i{1{BT=g5x@-b9SeqZud+ zM8K4acs&wl3)59%?{hC^B8{%?rxKs4z$cS1Q0%xVEL33qy#5DAIK8pKQVVT3Sw=rL ze9IThaoAGvSJFo|BC~eg&*gz9WTzL?4E&`lF!k!v$l>vloyHD))|gTy;jA*6FH#Tz12L{7_*K z9;1X}8;i_Ws`vmm<-tMkbF+274*YcPqhp0>;_mMbx}5_XYaZ9R$Z5+7g z+jz9%lsSlKL5)Le_5pt5=fk28aO}!XQBOt?D?}^49_9XI21ZGndCwdNu-R+H$4W`Z z;K9x2(!vx6;I;4iv|#-Y?A7G?=(|Ij(3X+*oGyC-cFX;4TYP^Ca*sjp^&XWUB%Usw zXs=o!`1b*KO3xBKSiYzpw5r~Qy$su(eSysX%g=whS)sg9=P*Zkt%&6?G`6>-R9HI# z&hoiDE;r4=q}@d;DK0%gj*g1N&uk&%%iE2Cs(-{V{%DI;VR zMMy|irJ`(=k*o^IN;D8@;GWL4MURaJIC));AH-%K++^t;0=lwyG05jwfxio{9u2fdfO6SV zhPb-{phLqtcArrSIw7}vleJwI+H>S5>(l5k6te#j|Gn^3l&5`^uTn>X9P&=LVA(Db zI1IAMT1%!PQc2|VhGlWG3fOP>`%5Iaslpge{hff0l-KL6`)hAeo>m(b4v!iu_G|o5tb9&=Ce-;YS%2&2q)j)BQ0Q=e+F9_X};j?s2 
z8I`D9Smzr+{rBqYy<2#F=c?6whBYl;Gtu1`Dh-n9<=nfwt_fkBNwBEf%@k*$h<+jApdXmJKSe)z@(_IoWFPZgYopRW4m z$UP~A?N@3q3VN=R!fwX=F>YW*ahk{b>0S+y3e7J$&L!}IPCJJ$ONC6xw2cwgU5rAE z<@Vpqn2w?kztVJXPFzFkcAqs8x`NQ(jF4nq;SA8fy895j*nJS{#3p)o3(fcL%KcIJ zaw-xPh|$Ot)+DoOTE$rAsiW;TH=WZwo{D5&3qNMc*C0>te)ngulji@u;)L&(@Gug7 zFv43y_!J1vH^HAJxQGO=i{Qu*d;_9)OZ35ro&u4dC30Ov-hjA+5&KzUZ$Qkm#5)`8 zB0vfDD2Jo`9hc{i@uJkXQ4s?Ve^FMk?G@Q5agdjGtiWNf466Koe~R^fEo}X5eyt=>XD&i=Rc<$}&g@(w%g)O^%qb8-E~*V}Ea!N^-Zt}x+Y+LHgwAB5O-T~SXEEt( z@D2msPnnu)QVxL{@n0=ycID&fNlZJWe1{^q2F}+;#@Dq>WgAiuWzV{ey`L`%?>9P0(GN2jJsa@aiCZEi;=a6i|=vW(uZ{$jE_P^6mox6RJo|Hu8&r zv;+cjpWOTWX}pYRwT{3WMfg?RMfxgXFYMr+J-hv|2pHFVC1_d5jDE2cEX-T&LbKRx zj8{(zxewN`Fs%ubYd12R77C3)`vc)M%$r5XnJ?0HNai%Yg<`104iHM3UP zhVdM%;HEfJV`PCaGChCTpT?iXw=Odtrg0ImvjR1c#_NJC5-;i5U-F^yT|J%e{%i#v z1xePl-Yt$(+CYmY<=~7Bzm{k{1uVEfxzV~)8Z>I>YM3f*K`bC@|D;FN=e;qa9uHdnv}nII{PDyCue#m|JCQ)Z(-{> z#E`F1i}UF-H&SlNYfbK1)rQ3Hz=_9B5^nBR}p5}Qq^i*+xa~WW?!n_7& zc+oro0-4Yw;P74DTZ+)j`}c%G!AYz)zx}|mUO&vFmiFM-{SO#woZz`1R1d}X9xlxM z+znfcggXw}w!_qW4WI33{2U*dVs0NA$BjiI%lbpV6q$UZNwm0Q@4Wug z?M@?^%c61e@G`Ed+k95Ztk*42J$Zhnf0KsRsl8P=YRfP(8FDgsk5+>8rBy zp^w&5$N#?hO}c!zWu>jM8h@NteV^x$jBmH7JbDokh7&}Tcw48v@sCkcC%)=1oOt9) zUUXn9RIWI0`M_}umcoEjq{pK$$D}fn#fgp#<^(On!>-^WnJ|mM1A@qTKvVS)T{Es- z_|*MMvm1+ZU*zqcRRS|PXTrPgkK({~^MVV1doYKAb~SgjB2fNr?&19U2NtL1cNeF> zqlDQf$OX3Erm!_#7Kt1v!MCj_<6lF|p!4grR7v{?sIi0Z)*~Yc)cU1xfwN|o^j2$Y zW*2&m*0leU2v=SVa1l%4h)9|=2BDH(2H{RtnHEWkVl8Q zF<1H?PEB3Ac(YCvT}$Oj@!PZvV>xZUi?Kkl8yb=m5UAV z6qn>gi~l3c9Z~J(vVQ}hV_XZYtj?u)mz5irG<4ucL-!7Ddff$Qg&C!*G@Ienlh)9h z+P9R2{Y@u7aE_7!_U)Yt;as9vgq5>t9*M!$McK)E_0O^3zV5*#$2=I*6QqB?FA|oo z)RB%geT7XkhhB_3R6w7R?dK;aze7tJHvCibN7xr}R*d}c4=#9Dy?HaOzmhX{d2Z56 zi29w`Gi*#(oUC+XcX$$`843z)yz%VsCUW3R*;AA6+sJX3FK7Os-5GY$@de#ELgS|G zy6z$-B0@f_wad>A=_7v6V<*TI2}+=)W_O?N%giYu=By zT*!m?-03olhqI&tuiwhon%5}yyLkd>Z5!l$p=bJi>1&8TTWM0aJ%?84&X>GoH#n4VkYdid1{{t>wOF7R%_dhb7soadXzt&HBf zNOA-xbL>Bs;zG;QmGCcUmu)6bD>(?`TPyI>xzdSgra2ftPoo0GKt`~SS+gzuK{FcN++!dpZ5 
z6bQ~Y!Jj3#hy<^T;K&es1EP0J^udUp0+F93a$Q8;fVhJZ`&nXdK+LnmJ3DzXZ11K^ z$x!g0ZJ3Hf7^dr63{+U&jE5vhvMaEWls0O=hZ!gVt_bs7vikSgx_8$K!SiCB5Y zMW*xY-@#tWMgElT-T9*GJ)ZO}>8Q&9d>EwuEW)!12mCr5h|X}K6baa)5ii2 zPz;x2KZZe>Jg;Nj#br3a*+=2q&0J_a9v^p5m>Z~gWr!JYeWRGVkDlYndJgj!mxZSW zt1*wn@tt1fDR^pnHtwa)66I=8H$3b+L|O#~{_k9}iGWd%^L-Jn8z6(0?1LS0SCv%sev)q|J?8oZDXc`odxh3vlVfm(FV` z)o@Q@z*&|jzhLFfo|7V0V{q_qWbTG!44aEL{t{CK{hT{RX&f1R?X=G>8s7kS4kfXx zpR9!Myc#tdZtMg5zGbal$kYHk3!9on&QIXp_ueb|sGs(~ui$a5LK$TFqH(tNmJHDU zDb1?F%!?>ypTpy#xk1!{&B5#QY{BBW+w1S_w*Zyn0uOFtcdA>ASIJel?~T!GiyCpT z33J>u_7O#w7oB>?S^?;5QWBXqZ^ay=Yejcm>M?7%b`mzajmI41Pl?tX!&?1);_^HG zQrg+()e0O}C|Qb+$|k*+N!39GqxS7TNHLA-d^>6~Fq81@r#bI(a5_e2$Mn2$V&uMQ z_C*hTN-fh)Fx~>1_8wTg{*j(K=aFzA=E)#tqgcE+I>}CrOj!MMBKtR9ydWB4wLzTR z^ZLB(uPZWSiyvd|0VP%V?_&6~o6E{n3Bg>xyWh{EJ9xXIvxfxPTRx8abCxo-zmD`M zR{JdC=+Ncrk|&?hqjOjuQ=hmB|MCTAB^zU5IzNh z^G)z)2`(bR>moQZ1mA$@-4cB;qNhOQXNg=FkvAajV8niw*c<$Rp8elDYZW5Jen{a3 zWGm3P#P+ocJD)Mo{Utg{c_fmd<@1akJpaMWmm7`2$~GMP~<_IyEZQd z)u~f%LTMopx|nn~cvLE{zmP2!|G7^6$(oS}Z=Wt2lD<3-w^bn<9TE$9{@=b`=T5Mb z^TQs+o!R>y9$M%t==l5;4^N&-OOO`9|g3T?&X*#O~L|Oa7@GT%cAz17GJbOSQiqgL;wmJ7ex{KxRjb z=f-MVu!3rK?7pF?vNrS%vYUAK#&`Spqe)@-c$h3Vhy%Wny;HkZVdKKnuMX`jK}&@9CYOZlH0%(+bZ)1 zzi|Ds#p*p9weg&^v)@ZMG#%}yx9+=5%+xn>J;en$#SKGCw(^3+BUWcwvV);{?X8D2 zp8`C?uW`JH@iESbx9i~O-igFKs%siqRgm*TZp-&M36K^NoMtll3{74t$O?UGz-iqp zD?u0jQDlS?s}=Y)QT5Z*>c! 
z0qa2>vx>k8%A;U9xV|^2wZ7Z9KF_|EKK@|+owc&DCt1Hu!*ag|i#8Q6lBU_ug)BQv zP@YY>Ma$onMb+g($>(7>7QQl|ew)1ulJiodG+2wUBELgjD@=jYUo;s$jtYW+FI5kA z$FPC|PorI#jVq+XLA&WQ?3lp=_xn;=ZHZV|Ot^EwpaeQ5a^<%B1;R9&$G3f-s6!sJ zcDY)AD}2{5n6f>23EF=d_&eHY!0TyyArKmhdnVelWsYL--U3 z&NsoICAf$LuZ!Tw5PSoocT4oah@JwGpCxi#MBae7gAw~#VsAjqv&1{wxp2`>J+c9+ z`Rnz5XWD`U#Z;&TUHl;BwGXV_DTL5LDLU5k34)oc|8jUgGlJDJ_ldo$oM0+fj^^qU0(37L4<*r?<9WWdBP$V;f4;>ByIoO2gdsp&X~8`k>4W*D2XSHMB|l%6VDaGJIxnlw~`6?`34$!pj%1I{e?-vQY#U_r1$;Rq`d~PUJi$xn-0z zmlRuny?u>+DQD(8S2Df0v^H0KS zHQTA;EzX%Avel`3KgA#aovBKNVqJeEr~_D=dn2Rv(=WI`Sk1h8U^9B+@T=o^|3?^+ z7V9LP$4Je)Tl-simm>>cD-J9ni7T z?U9-|2opMGUu)|2!aF0&5-&frV_?6*v)^--;+vX#JG*<4r3@4vHtqIQnq?&r#^%gZ;x zEx*f=CwVS_i#u;|HO&p-!Ex?_54ksivBlU{wmM(nchD_b@XT-g;{8YzsRiTgg^!F zIoY@87D?HyV~#Y=H*VvWCDHh^Q1$fpRD)Nhp!sQLwrJfxkX5L>Z{*rO^yExSA&qZ9 zWAMLP@7=g7zbhZXHtTV8oR*ClnUE^<9@Q1y!AzjrVx z?Yi#5MN#mQcSmp>TA=Kr3z`u9Cjic)diDc91W`|A7_Z)^GK=~%!?aSHHAmaR7il{Fi=-)BdWlhmR#^PdE0z$O$i-JX(+9CoTUu~Yz zwKxweXOs`WGo-?`dlE;}&v77@Mk&dQrQb;G>hoH0lix^R%1&97g-ZZYL!YgScZ8Aq zRge4g_w9gOnuIz_(pDr4R412HT|tH6`#s0^sspd`o>W}rjWStY?x>-QMAEZb@z=PH zf?FG(-;msV4Jpl7yek$xK$iNKpJ{v{7Oe#|jc8fZ_!%cIDMqL6C%aX53g7*574?xe zNfq|Rp>d&>^lphh7|~N8^0P#)i^v-gcQ9f@vM zvpD;kqN)!zmu~w@d4Es!PGETm!W;M0Z<)G+)T&+owjQ_uOz-^EEGr2_EtXEwu5{MG zM=-Uk$teq)#9Z;7eLzA#+`dF7nb`wI#V^~yy)w)=#nKbA*%D;>8-A{X1|VeM(hG+l z{rK;6br>_79hhp2Dvj0&gC3R{dprsSJJ*M}O2E@)qy?{0u{1%+g+!;*3418wbFED6|r zdqinY{x#bDIQ|ojS!wx8o3r?SN z_6Eu;U*fxs0uhh%>IBza7c|XvgzvfSMqp)rm*wx1QTX<(?eQ@u71TPL;{8Hl12|31 z7!~88Cx1!j3^zZcjF!$McZN>-qp(EF$w$My^SdP5AKOftEh9YD)iq9;zvit*^C>{f zl{IG?=NsRr(_bx(g-9?HEbS?oAt`?kSY{jB4um(R>d`ndFhQSxoo|3UOq<%1smGAZ zx4T=P$LWHHy)JS`o)`lG|K#EI{A@_6-=p-nsu2=vmOq>-l*|thr zKIT-`Cdq($1a9t0KfX#rl^eO&-`SrVoX2Wteyk6t49{G{%{c1*0i7#&57O84yR!Oe zJ4lVS6|L&x0;+B`y6=Z}0<))7$HmD&7=7BE#a$~5AG_0@e{lFX-uP5xve~E$|mUWPRk<} zBerW%uAay^UgY2&k`;m;cTOtsUIa1fDX)XuuYqtib5;9_6X12Z;t8*oI550f;ks6f z9(C>7Fi&Jv6sQEp#MCNcf$%|wg;5_p>dcw^xV?8HfnQba?fkauKq~Zhy@xrO`fcJx 
znxw%cU=zc{oxUv$)V<9<;Tfq+J(cZ56w}$X35S(v!^x2^1 zyVmAX+^3rVGaWq!4556MB zB8v7$pZOcJjW+)~8|%Hr%?Q!4c12S9m!au;CDjJL_8p`xOM34+L_8h&6ae z;?XLJYf6@W#GDuLW&b@pM?Z3h8uLqv2 zNsj&6jq%r^#Y;SIoAr@Km=#%nuAx7tplf?mKyu@ zg40@PMpN0#)W8@WF33@jD%JxBFYFej@w#BAz5O{FM}{2c8*(xwmltKZZ+V$tu^$*X zK2%N-bOg!YZT;7K3c6)B|JL)fHv6b4>$$EU?M20uKsr$WYl5ZHSOk5O_2VCu-i10Y zTu&~4#Dn}FRsVOM71J>eTz_Y~>AsSqZqbpg$|Trao$Ww$u=Cc^X9l1wzjv_utRa&7 zJ2cR_M+wxZJk69OK``rWPx(Y{B4J7Yvrt4KNJM;3iy6qq{qq(CZQ7jQLo?fK^)2O`7@ zth5791IK&eX=X~PC`C;BYD;zjnr{5H6rFb+oBbn^robOXZ3!Tq%CtdoZR(R>Mj2PdPKrk$#jT#NkNw<%!aR(3glzl{CAMupU@QR9v8E-PKzDl( zo6>mY>T`B-8E4KsqnQmr{Q}mU+3vvo^EdUOWMOiqe@{G9)LA6-*6!~jXXsk z-$1+TlY&O{bdbnh$~J9X4YYCb{1*4fO(>c9-q+1nSMY&;y|pK*}uEO)#jjFx}yaP=WcL&vLx?9(P1QY=b=mX>PE7~a^polCUY?R`={h^$7xh7 z{-Rgp<~DL*O%-Xn&=|P0Z~l2Fb`Ub>7-Y{nx8U*a!%wQ?mC>~wtZ@wjdqAUc=A`;| z9dIh(hv}jK9l5l#%5#v$$H~ds8@lXD^9pU_dYeP@W_~CGOKa3uSWu1h?jdk~o>$(XP-l(1_Z$VmEvM*i(_p3xUOsom9V_C-8kF2Vm{}f!S&0BJg)uCv6i5!lIJ zR{A9U0T2<968vF(m>U1E>3wH_G_~c>_0=U+5d{4whr7cQll?pWXN7 zklF#L3N|Gn&Bru8Nl>K0v(Ni&z-7MgY8(Q5RQ>Xi7a4dIwoeQ7{gs`>3pz$(Y@|%w z;XigGO~?{V8Re|qe5ruwLO%1lr#Dl^^x7BC1*-wd*pb~gK26~_K^!Jd)9KJF$@)af z%~TxXZh7~^?lk;$waakaAR4E;sAlN1>5;>xB0f0|vXkkRbRRx>szS-eG>p=dxha>0V zs3!TWq4zdHqcWhd5PrkY&XIgg^ZqxHVO6rUTWx3Z<0s(dla$S$FC8Yk)7(}A#Y$wQ z{Kmtoz8Qe;Ej9XWBP3^PeM_%hutcj#yNsNwg%RNuCw#Yrhmr7u5#AcYr$BJN3H~g> zMI`-?*F|t-2)+T)yCwQyL{EXp&l0&VB5y$4!HE4Vu{R**S>m1L-R|Nina4*?V7}V; z{Yy1Yw(H*dacKmqdxo_h+}IBDHA9t(Se237{nHu09vh;Zvmqbp!*tQrA}@h>UKPZ= z|LCEC!YQmoxeXgyO0brl@s_n;ckwk&%52&7c-;AZeD=w37PhWGu*dSg7}>HWYU%n7 z74oO^THHyORLP0zcLzxtQsjs-3w?X1&sZ;8X&_5?nj#7KoL)>!Vs>~40b%>rjGZ_@Eqat2aAes`6o{DQC}-=*xcU21GqUEnm)&h z)B2jn&YhkAgrE2L#s&MohtH%hC*ERy3sofc+l=ozjH1#8mE2Z!fJ@V_qs|?_FvH2~ zt21q*P`R8-^@Z(km|MGYy?ag*e94v`$?2B@Lt0lCGS{S$?8fLt7T#xAr1hf19Qz0O zL;hM%d}ASOz46(o5mw_N`q}Vm-9Fs@Pqv%Wl8&6n5#-h1D@0~w7C)P@MS{Fx% z|10bf9NwJy>L57LZbO%mKM#AGT{d5SLQigI%^OX5auV$FR_lBo=SpCu*>zxVDlr=$~& zU7TX}7*VB8uBI~nAZrOX6&ddNCl8^Qe4#s;FLvQS?Y3KJ 
zJ_Xoed$Wed`6l;y9KBI?-3oDe8r~Y5G)8J$l&N8f6HvkCYzmDdL*_YJ-pIixqZ&-xh|KBhgh;`d4u|Ys;=vIuy^5k z8r}Ia2wX@~IUK@-9-n@{+TJgq30G&_T%E zEUy_9CXKkHxZ`2C3R0Lq$T6wMLF0c8em)VR3bc#{%VS>V!yBItH(xethHLF7mohKq z!lYzZJE~zg-uvfDhlOY!d>fzk^GuH>_1UpK-d^Irv1rA-aM2+Rs%~H)`xKotHE1ij zr-O$?9-Z}nCOl+9e!k23S+|EtmD;w z@^o#oK^-Ls>8-il6Wz{1Eqs0bVT#*c^5M70|0Qisx-@yql5}GcU-6FdHSdrm^L*l4 zdb-aOOfc?BVyt2%FQ>WZwj}+d?CmKPlKv1FrueG z9u?BbJnC=oB^iIk2ba5GvJJaTC65@mJTf+`HOh;++B6)Ruj_!%g+I;r`e*u$?4CR9}%O$)ay(5!=`X`K`IEWWN!U6UlpN9+IV+t?G}~C<`=#{hwq`n_BkGGJ}oN~^)zn1+lmNhi`Vfs%j z@$zE-rqmXQ=b26%Sye_%Sr^aTzDb8{UFmtpGGxG7WPD3*-z&Vd*U`u8d^LXhY_p&r z`y2RAN%OaTN*;dV7M!1FC{Jybyd2QbrbDF*{4P8CU5k29UG5!kFAMccRB1-8J}pNy zyu;M3LJsIT_s-Y;9)LNMPabg}W2OpLuiEV8)I*NvcLY9_q9;pL=e>Jo{Rhu=%&J)V zi6Qv{yLVC_|3IT^y)o3e0l^-w%8(@R?zq78lUA z^?4wJ7Jjfb)WxaA1S>a%?{km6s%sN<>Tw}9H%WE3OMT91Sdor+rGYt#LjXLgXTLcq4LL)liN!( zu->1agx!z;O8^kK!28%WaoqoH`-hy z(8i@%(SvoZ$~GlOIwOXi^7^BPWk7>o#`eK-I%n24^g|(&%1*E?$LS*y+c`M>-kyr#=oU?v|QKWT?^{FX?X*_Z)Rt% z_ZXvvtIVUDPU!-yE`Z!i^ilAY;jRB@^Xzzfgijgmowd!S%O9bF37V#aTbVeI^Gut-!ZC{dLKq9G()FuSij= zgR&>^lphh7|~N8^0P#)i^v-gcQ9fyy@3(2kAF6|EVh-}4WLx7mkM4;h;jBrKI6W2gr+fC`g1kI%o-ld! zCifL?KGDjMxZ@E_w*A8uWncq2y0`Cj+IbgiWuDxBVM_))Frde;F!}@gEe|DVBr#KE z9;`$wr*Tqy`EBN7S?S1*qix4-?a9Q+>5rEBiY!syQhdSmn@;R}FKBen&Bs`2T%y-g z%M;lKGaPDTTflVMy`kKsi&*(+L28Fl2pVxY7VF>m7mIDZAXV{A8LUj(<@>m7LlL`- zbSs#j;eCPzd#=jX!7Yag+ zXFoj{=mvl%`?%y6y;s<(aN*Lt86){$XtaI3MeJ?epnX6XqH4l5eh{_3k5{=&luZ!n~aH z9urk_u-kXO!`)Q^9dSPXVryv+wwRRoJLnYe-8i*p6 zE2m9Ff_M=trTh_Ht;588HLj)#8%m02Dsr_PZ!n1NS?~=Y0*j00mqptvL`H<5gR=S zKCPaKql3ndhx$~)5ScT6NlzZZ!!z(|u8;*tW!8)ET+u;6T~p`zgxc}Tmu4>YV&gD( zN90`oNenGoydwfaZGdD;&E2v^GvwXa6eSZ>1?9n}l3f>n!U$6DSsJekUVHNAIe!r& z2rXpZT_?6k+1E07A@_MO94wtE{i&uv|Nja?)p5d&`>P2{?4ZE3cpwTFBhf(_7|lY zNnGN=TO38U5Yl@rxsn_N?C>;y7`+it zhpRtNoP3H&=jaww6eFI}XPw?nYwQ2Ix8@RKMaupQn0sEi^ixWvNVX6KuuL@dEAv0aEuYJ)0Xt=z! 
z$YK8s>wBTd8Fz#y_dlVkdiw*zTCHijd*cw1i;Sh3_)tfRN_*6j_ys>b9kSy5%1>4Za+GE1s8?#Z!p*w;P{o!zwj3<8 z(2<9q?bG?4Xz;p9jhWY0z~@HRIX?=iD`C4tMwA%r$gEoznto?=+^n zD6a?-Oog2EX)Ql?k!7&vr9e0jAJ%DFtJTlIf|?^*gas@5bSUY9hMjjbmTw!L^3>B)u{oWb@p@@1V&pCJ23ivGDz zVd%omr`aF&dV)fdU%JyuFJxqU(bwV-g6NJGM6>x>0x|9SL_ymDD7e<^p!qKZrLS$T zv~oIu1|0vbZRs9?X{|rSw`_Jn{d&H9diicJmsK|9LR|#*t48M&&r|}K& z>TSCD#OOKFwba&e(V9An_hkGd22(+Fw#93sD=l}q^6kL|nj+bl}E zgW<-?QnD*j1qz277+b69fd22Pq>7Rq$nn~q)vyaHpp!BC0Pd87!FPA<(r(&^AC7Rz z6;re@Lrh!4>fLbsd57(Z-95}e7Zu+YV`K#)yKk4=tEwlR%c|Y%mPwC7LNrpn;^{%{ zZjGXn5GRoJdqQUO$K4>!@R#nZY!{H(c=o)wnKg)wcr?Bia27Qe^`GdY3qoT*4^#Kg zSfbnUfdQ>{aiHUVB4$|Prh2`YQoXdx1;swUB4Sw@3&gGYst2vOsZ8|qtNF)Ipl2;I z!beX;f$D)Hv9o0xsO{$!1$K~?fzV~UjoX%eL4Y}Z7k%v?+@`M?5me#=0-~PStdLWn zG|$H0^6$>T(v`gT%w_UG;!$^Fk{%C8J5oo>=*gh4=Um2*s>&hZ6(@YRgolyvgAv{u z!lyuRz6t&;!9^r^T?9vl;2RLVTcQs}^c0BvERpLX@&?2mjM&c-djn#gCEnQzw*_?> zUq|?RCuinx_I})AEwwW>2*B0XGY*SiePK`$X*NWc7kKLrOTelHinv?y+g;I-r1-op z%j8FWq(<}0c%wx><+TDe+BG5!EV#}ZSuN?;;5M-@uYjF3V1JT z7L}13gL^*S8=?>Q!fy@09Yae=lqeH&mAZePBKc?S?9oMG!1F|0N@>9j!jC^*3aF~U zl~al%W!(p{#d6H5zt0-wNXw^tmBq~9ZM=OE|HUOzNuBJ$AM)I&?6HVFt~!VEMjxBH zJc~y63@78%2WUAb-HS%0iu^!WtGLG2m4x2i-k%kBst^}cTKx@l+X2qENgi0qvqi=H z>DG@7>+#C&caz3#2B6_JpLQ4j0km0zAyLk`8)tphEB3b94@#w0xAs`1VEdfnLJ8S0 zsFfs2w~dvL)=vWC73pF?S+179WI__*Yr@H5^Uv^?xgaLV#58EAlYYBAeV(K>dOfi% zXb13f|2#N)T^e0WZ#t%<6^FmQdRr;E5)QZS&@K3J;2_k$Sbvm^5Rah^!b#hD=*laDnyo>?@ECPS>eY6WebfHh!~H zmTD__e#4EQmU`*nEVoqnv$d;8*@Kz;xbHH4IJQ>rcj^|f9w_4E$UBNOXB#RTRlehW zH|w|FSO^5ylYhyjUk<}e@OT#cPbWyNqaLbC*8wSK;(q1Q?(bVn%iDg9sG$AtoT(fQ zIr#UB4cT(nlVRrYyw8bR8pk5CA-Aj29Q;v<62HuCg`^70YS;Z>Wwo05>)x8@_tF*) zshk9VPCU*QGuRIVYZFX%-ZMvMg^MF-Ttqx{p~;=b>w?MlNtHB?47So>>m9u>2sYMG zH!-)&P+BI##YLng!Rd<4S_d|WBhi>;3F!4*?Y+O-F-#06w9N${?2k_#`A`-kVf+Iul4T#}QF+0Wr@K?`-Av)B|UZ z8^FTwc>T4@jQHHUDx;9VBk+=Ac%;LqFmON5;}JIho8qRU(#w>2AEfG1l#VmlR6 z@!Ac|v^_K1(Z-OFeF3jdfb@>Py(4O6_|mO4MvGb*)I`#?eR$as{JimlzUNRK77sI8 z-Tqh~xcM%rmpO*Pwn~l{D*j%W@?-|tY?B3>xcY$K~`1>A<>W|Z+j;EuD-Y5`TXxMx69M(dOjbI`~7zx 
zO$Q$&JhJ-h$VYh~wL0SOez6Fy*q)3EPzgcjL@8L4?Gz!cyL^Ii*1rgej}FD^rz-Q(+NIxG|9?{s09x^&VI@Kxva`t<9 z82bI<4<`O*1*n6vlQV#e6Da&R`B#$SJcvlGPnTB;gnQIf3Vfcrpa<5Tt?G4Js6<$} zT%YD9ktvq)h6d*eDDa#3Y^U!Jg0G4zg@(Z~(tef2R$w9m60!Vz#sTB5sSZCK}RXzQT`LXbF&a^d-HAIJe>;-u&MlKdg3>#@u2hA zP<<{`6mRvA;$jYzMtd@;FJm6<-L(pK3dw~6q}ebRCH*ypsQ8$!<}+IxPLj)b&LS%>Yv$!Mc`snV20F3QB0TbO=a4=RuB%4_x3hBhv8L$5wvfapHu z>HqUE#_$(2|MP<_@3XV{-L;1L3yM;ob(ld{&D%|A1s2_aMpf*t)1$?P*jg3ZPLCu;e@U!L=aUb&{h4sU*5l9|gU z^_=>0^Wu#D=M~3&x44H9_k-cy8r-LVbG~u@EY3y5d0jY12Im{#y<5BwhW8Zk{4Ad9 z!t)0B9Sq;k;(G)9J&VuTdH>Usf%M|A5Z}7g34LdHmG^9-=mrbaPPJcF*2)BWEEA1G zj?EHZbwAU5^l_E&%GJd)A#t8KtFRVXscs0N+FVKsl)jL8)sSx7lL#n~hq?X4M{{sx zG^+8x7-{I!B>eu*g9wyw$Zw}8QVvqCS}ytZoEsQL@_3186{6>BC^wsDu0wrBT|pm~ zNgiJ6XO6`MRcONp${42OHc)YWeqU{*JFM&e;#bZWGt^Gc>%Pce4rtH+=X+<;oc6cs zrGq&rz(lRC(rx(^ILCLq&g)PRI^5dh`C|DzWYYCLR?v*(FFqJ!=lUiDYItA&q7=19 zt=A1`q#SMF3o|Dyx-OEV1<7H2pA^VIgR?|VYC1R2A04=0&VL-b|MP5c?D}EILHKV+ zzP3LIGp&<*jTwZzqtQdwT5xvl^%|?Yg*0j7Gl%#!cOH`H!H* zTw&s(brq=mKFfS=uNW0kySe#dm>2ZWKN(XfR0B?PRXJv5HK&Mn-bnkPBIfwct@2D5a9YGwtYUP_ZGG{GKTtE#OOk@E|@IW+Z-2 zXZ99`teT^RHy-T}S0)@wy!d56wdfDQdXEf5 zGO5l}d^-jitSa{Zd`S!0Hr-Z!DNO+?I~^G!NxgZvZ0gj-T}^PUVP4p7KLCi4;uW8i z9Knrb!wV+yaZq#M=OE{b49Ku%*vL#R1gc6AH+$V`1l_+cvG(b_95(6cGZIbk1Y;gw zXE)y)LmVY*UWF5~SjyMy3Y|)xK$&LCVt_#$61k{G=R!Sz75ENI8-7vSykj&Nn(=?UYF3&%)Cm_zOu~M06@? 
z%}g(HnNU()Ia&STFtqSOpM%lh7vbhYH)l2#1$4TH`ClIlZJBEO*HeJgyF35oXVJ0s zynC*Oj9}`&O>@0ER>*_))K|-=w4jQ*M_2UeAHt1s>#l!$gQLu9O#j}qRerZR{>|Ac zrMhRKze`b;`IUmOqhyfT;eMe3X);# zd_d%(CFUpb>cAyp{ZVnrU0@s4S#QMc03y{D4=LV?gBC?6coimdAm`H`%vFZcA(4an zwUN9O=)qwv=IsPNtV#c_%L&9AIwa}Z^d&GEYBcd%Qd8%{OlMtqwM!5;V}EckVLed@l$b(Isvy{epl@HI5}mqa9w?FuwG2>=XQmOr>h+k|LxW zQS$7=l}`lRD~|haaStQz2gAKJxK9D+eB=CCoQsI_x^RvR&Nsk&w|E~6?JZc;HyC)P>lcxCKUj(?Xph+XU(jJI z(`CZKEm?W`y+wj|+3>VG(GH5$b@yQF^@I#De~LZ^Ye>;F<-+PkTgdmR~7ZEIKqboS06q8D~h z`odQn59kdcww@)i>n-)LjnticGHNBLx;-%*I<-Xz75hR~!tjk4#FRU!_n95iYWo!{ zs!!rp34eIsY21PufAZX>!Pbb3iF$eRs&Zh|EB2(e*GWj^tCZ*)#wygXrZ&l&x*RU_ zFA=n)`$TI+j<3QPGD5e`|u73 z0ao3z@=>a>;OkC^YF(Kj;A_F;b$_q}RTsX~uPqwT$E#PepMBtle97uU`j&LSjf)>I zFD;Cqg1KIvZKm4LtzxD&s+Q}3Rl`Jn!T38Wx~Yro@ESmm-=FZUp?HEeNnMqWKb8hd z3EyF+Aob-<2zz;&iVP4vvK)H(x**^?a@CoVqaEd}Za?+9@-=)txvSM$lL7pR{l@w( zTNYB;QE(G9p$7xYJ;}=%+30BxGT-**TsUFciMD&x7Jbg+Y!O_=4=9Cp57&;*5h&fH z$LF0ypw}uMOI~(D(4jLU-)L60QAIl8XXGX`=x5{!cPTiEz4{*elf!-&ZS|3(3V6$p zG(QS!<`I=cg0@?R)sv}_^s%B-k>~f|%-aj&KUdv=xYpw$+dUbis<*7^8bpbBy3(AQ z4Ri;t`wLaaAIc(;xe8k`8hh}7SL+mqzcYBB*C)80Cyng6j8-B~$PkwT?y}MYBcN|O zJsMba0;zz`m-jT#gTHMG#fRb35Ougo%gVQQcycDj&9tl@Zm6UmrTLSBLSc%joIgJk z1dH#FcR8O02zg%eAhrR=JPYk&vH$&ng!hb!5 zR7t!pv@AtrOSB~y_T)byP2wBCPv_%}#$Ns;*rhcK@-R;lKj*zsKe5aLq%$^UN%>j$ z#nD?Y<9-&4UC8%bekSc;{g}x{oyb}QXfFaTGfAy$T4_wF=GA@K*2c0~_J`Ni$on2G7H z@aKietVjR(1Ew%`1~qqCtl7JYDWB9os^o|tueI|BZ@%BED>^2JktOxJUtFm`!FJ86 zv9T|p%gcPIHYtrgqoW+qy7&iuLbKjP{oEFucV6lmr#gXEu6-&eObJ5AZUq2Comt|x zs{C1I<9D#kWu6QT<3`w1-$y$U4 ziUmBvpNlJMsxyxE}Zt;B|{pNtcTp#qnLrYWzU zX}{ru{e zQ0}F{oGCm6OgpwwUfh0ovd>N=Oh|zi znN5N0oN4s1u-be5wjU^Bbh-0;-s!L6d0Rxgu8)5i3q}aNzt>^1 zfLE}D*3uD%XQk-tK+WctaZgZ&yPA3qy{AFaCfYWAj?}m1GqQD5>xYj=&wgP4)re}= zQbkJr7(yA(7`40+AP08({il8zt`gMq3qDY+|0UEEca!avC7_1kQuhvVzl1w33x1ta z3W4WqXQX|sGSQ;2N4p8k-_X~s>^poqohYB+e4&2#3i{luVW}$OGir8fvOH<}2%^gJ zB-E>b19|4Ke92&l8_Bj9NG6tvVo-6-+wVp}U@p~cwjTUKFL&m5L%ubLm@kX2)Hi)P0FYv3C<4IBZaoWJiaY4(zzIzHkSx(Z?SB 
ztsCgB>EO|*>3t%-_1Tj*g#Hpghxzp7e|!$3r`(^gxcI{MF1(v{XbXC*X{*s^LL5-2 z9FZ}2Dh!P~KaaKf`W~)3123I+s7BZ7Y7<32e@1)RPMZ1hi-T|z@te<6#i637r#}yq ze7E7XMX&yO7|)ZHeY7F@!IUZ3&C@=$6BcyN1Ts8sCH$e3m#K^YOnfstA4lTPqAv=B zhDcmQlsk6OfW+&9wS&j4NE{h7QEIyV_!D;UvEYrSofHc+xZF0-n*Nt?JgKPDY9R*= z*KRK+##D{ex{8<4z7vCuvf8L$dFUki6LK@Ue}KDtcr5R^wuats~G z0WWWW%)dE17x8t%?&xFGq005V`7uwFV%2a^-{mH3oqtvPUV}esXe(KEDuN6v*d#oo zM_!{c{ykNcynoPH+VW#B$mh`p*J*m!tu;7288IHT*@Bmq{I}eS7tlE>)}!Pq92n`I zW~A5>2@Mn$<|FA-aCGCL?#4BCZ0@#v_W%@0;xHS@HRgSS?FYU%fZ9$oiuuQcgLC5JZFoO#IrU(Ck3iPAd3aq_z5 zmCPdanHF1B1XD7+wCLd1Ecps{j*OC|TNedGS>$CyQkw*&@KfV2E9=l_aqEvMDO=$@ z*N5Jr>w~ataN}62-5G#;#c|&)?qS6JV7Rvi_bK3fWAj9w(5yM`;t}Ep_xhQVOow@ADRbw-wXl%to)>TgWDcfj3hZ-%arKF3{sV~pG0Ger3kbK#$6%LK8}cJP#C zg{Z~y^A^vIEO=ep!qBH zt+E3ZayYTmtm)bpSpG)mH!izQxV1T*t5J*v(w_eOBif}NJ%SUP4vdVxV zb+JpXAND?=&h#G`nMpry_4a*C_^B(sxI8ue}OiBtiLl~PyzEky+37tQ3|pW{I>VVoC!2f$=yYg%i)>ON8JuP4d~=+ zFvF0X0#9aO?mzPBpwtPSL>&<8G8L>RH^eEMj8lf8W_9C&-_xLb65t--4C-xSFRe2s=JG%yV58wCK z>!(0o7d?7&xqK5|KG=?bcXk_{=UWIN7bkfdtW(zFzz|$8eDI)$c?*sD7Jp**!98gI z`^NLyXTz}k9XlUS{WY{PYt5Qb@h;R}<&?CY*9UV?SoHS7AJHMViV&qOd+7Rr+zm&l z5PeN%bI=2&~TVD4B9p65VvHdQGX9j25x@Zdr}+L229V zU23mIpykVw)Vju%uw~4BPSXn+XxfV|@jK;D(8HJRc8xCxKsSEeW^MXU8safJd+#R6 zD{jJEvdl*E-Bu~Sf3`~UFzSAuVf}q%lQ4H)qf^#qiHOk1M+9c>5RA|C923cuMA_7? zwvza>=*pQ9Q7dH^xPIxb<)FJS>X4&%*sk*kWK=B_noPGyWHp^&v9lI}6c+@v(lgTG zUlLnfkAn))w8>6BOEN!rI;C(bdN&9yX&p!<3#%hob<=)(n)8KF;DE4X->o5f4$dha zUsxe9>+KzGBfV#VLTuK*Ia?`~wq!Z2g4*!$b|pr0ps`8_r+SkD+Q1jg`3^pWgnH;5 zo+QM;(dYdof{s=gv>~yvz4T{tw$xGody|?TBfVt)BXpK-oOX8YBIpk!cI< zx302c_6mepXS5P}biSic2_cjO`%Uyje(P|-VgTfX{fQ9>`i`PK{tHj$t)Dsv#I~qw zjD2I*;PoBy(ZME9ShF%w=7(1zTsWpvSa+Eh7}U4SRwRjnk!N52n~i(~E0437O0X5c zm&NLnSgx1DvBo;Ttc&?T_u+e7FYn0!+$)azZgCGI?gzuYHMma!=X~S*S)7ZA^SW@3 z49+*ed$)KW4DTu6`B^;Ih35_MI~cy7#rFpIdlsLw^FvbRAA5}9)%#bPCP;vZDb<0om3!Afg!}mE1G1l`AS6puRj|k z_>NrM<8(NBsWjq=5>+i+q3E%6)klR%6P8b|GkIqS80jS4Y=6<##g20Q* z`bYNSF=*(Dw62YZJ1U${@q1Dt6&7a=${c4(K=;)7U#CxQ6SNlCwe7F05Ly4S3bH-! 
zCq62?e%L$U77XIdtoL^e;IIdS?dP+-&_D7vHQ}YI@cYzGpRVQ+_$=jybJ(?6IIgt% zz-i?hykOWuGb5A%r^FtXrv8=zOFejH$>e(+3g8WBW1&w+pPqiY2*1vjBVZT&}ZHVw6HnT9eHdwXb7?MMwurjdM)(UbxjDr9(DD78XlX+B<@ zDW(iZOoUWMH^rm-TrWzVr-i_G-q6V2=Hr9j7%RX3_3#+zzvs$_a3;f?uXo;J^*QL= z*MlQXHa4ibe-(>F?GAw>SA^-!ksab6r`wmQJgN!j`Z?0I><4*!kDtM8+6LDNX z!W-?m|A(bYmJ!`KLRBAAF^MkAeE=va2bDiSX15kGgv$DT&6YaA&@U4Hj8ANyq1&e@ zOxCB0;n23J>XYX)P^(IIz1H?;aPqZ&mxnR_;O4vuZ?4}O?D#V;#Og^6{7wGvm%{@7 z;M@90?KSl^xXPu^>BHg2FsEd_^#{`T(AE1xcbV1JVXZGm&9W^}xaKZj&?+ehHf0*y zGD`BE!6()XZOXn8p98trge-FK^R|Vckex|ypmanjBud|f04~s* zATz-f%@1@|%nuKB6u}mt#<)K22~20+`{Axd7&;E5K8_`EgFwN_5knHcErP!4_&QTP z9CBC9hnnOSZ&T9Xqj;2C53Moj_0$oJ{hm|Qk| zuy9)Yjd~ays-C0B%@_gmehm1i>FI{*UC;U%HM2~ZsxORdA(wx2PMRoqQh8FC2k@>UCs65Jb?c&SkQc7g$ zp#(b}O*cFXp|I6aN@SF2^IoXnDolHOKuXeu7F$~2VKud6#a@|h=lBa!VoAiz&3m%T z@Hk)FOy7Ywm?e%ZlXD-S+D*Gx2&>Dm`{YH+;%skV#JP?+5cbgp|C?nz`LpnIC(a0` z_nyH0zM!Mh*e-fd%5kMn?02I^S!uB*LqDmGu_g$%=L+{2f4DIc=$@es`d&x`DRa=P=ae8d*-M_^7jT1V9#J_{!FYM zRwg6CqB^M|l?Z`RbIDEO)B=lYP>VO55x{QKIC2ZMea8GoGR+?aZ`uW(^zlIqYYg*O zkfWfoYDbCpX9|p#`tWocC!!N>y%!=`9-zUu-I^|(-66g{+SAxyy+io+{pDFs`WoVy zIx^?QEoF2!Tdm~Cq$e7oK$z3FQGqRbtsMO}Zo}c^F%7F)D`=pb){5omUz8y_-BfLA z3FW93V)HoYMn&}rIe}7d&`M?223t~}#9(`hZg40YzP!7{o)OTGrk2r*Qc9lzTHLNY zyAN~Whpek5+fSR(A2a9n95E$ekZ4#C&RT*FFlsajb!-z&b=MC|t<4i0oH8>fN0VWV zjIEb^eo64TRF+>C(E{|z8;=m|0vDJ$N{il%q=GEUnDe+w9>LV5$|BQOcYhY7A{CE6H=Rg^DkfOBMN>E%D_Xff-?y<0p{XxClg4r;>^G*!stjpJk*YI6Ki%;HuHd^b%@M8x_7 z9AmQasgrFO9eC#E@`&~+tTDd}H?w$wY#RTG%tx=$u_s-}g2~EY2g0yEV}&P}8)Ywj z@U;~E!ZPr=RJIa+ek8i_tD*;BJZeympHC>Hb4MRsgIdA_*Z3D0K46JZcvw;+sgZoP? zydXH{jqxX^e7Je8d@4S^5)CTL+}67s18)uO44$gu1uKtFu6;D&1Ip$8R_%@|NpRiyQp! 
z&>ce$dMS(+YkE6-$>r1zTCQ;A@tJOFWJo^3E@O`w8GdFOpsR0zG0DnF%A~g`6)5pH674M{h4Qj^u6Cpl`!= z{+4ug!3`&ULVP~3K&MEUI`?;{p-|=4kaL?vbldC{U%hS4|GeV3?-utk;(jpPTZ8)) zaLzZ*pT)U|IIj!m$l!bfymyQD!SJ2}o}a~YU3lIAzk}iXS$uDRzi06|JLD0gH_+$- zcNjXY8u8b|I~t;W3f}Kvv3Ckp<@eviO%hr?UQhvA@YYcI<5OR>S!KZ?(wrK!1yq#; z1zkfwwOrUOrO8Gs-Vgq)-4z7e%R>dqEJ>*UY|*}p zuw|BR7$A5Hvj(b&ye0D3?1^wR-9W8ujhrdZsl#qZ8DqevJDm0CImmgz0)5CHGeRf# ziKUU#Q^m~DC=3J_28SYGi~Kj@)u;X7m4RxR(P#e=&)?rJsk=TvFb^m@y7Qu)uyp0Z zpvC+RxWYH}vgSo=n6FMZc0JAly?s^CqL*~vulSZkWE|Xq`RN`ip1yVjNmyyI{W>C!xZsMUb7Jn<5{IJ!l=)WM;YcJvEjSM<2b4clb2 z*Ox)Z_h=zn+Ic(?i7SP<56^%9)Xf4a-rL-GWJL~5^+xn>5~EOOn##{%DTSzxC-|vJQQ3HEMaqUEUmLOE>g&#bcb@lje`*jqnG&Fl29fnRF z(RePX9RXiwGa9?zs%^~+vUrfMe*5q>1Eo?nESZbp;8nK#8Gp+DU zM%YjN+5bb~gqd~x&34OpgdDaG`RPK8gb0jfha`^$Vp6v0byXaXkXZ{%Q$4u=y4lG0 z^~8k$jE|E)&Hif!()C~X#KqQ3h#_wxio4AnyPscN<@e|wGWGgvpccUggkM*zoQYCL zUI{roe@AqM?Ao~{yg9s}^Rt>^u(J$S96;xH*_IEp+G-V_Ry&6TOKTblfKVu|ZqxHr zh6ixU9Y0pfuaBh#4E$=ZRmP%zbq_7Hm>>xaZ!N}j{U9clec!WLflw&bThT3o0e15v zbN25QMkH3x$6>WVA1V4RydO>H1m3DfUfp=$4LW7QkuO=%pxiGvh~!y9gsatEN&XM` zs?_E+5|<2C;50tYc0Mh5AM=w%nnQ0KI=+-hOa96m)fM`FiBug91zM zQ3BFELD}%G|Gzz#pQRu2@B7!ucYQYh-US+7SjfN6>tg3@9rK_TBEc5r_|oPSa@Dy@ zXoEu!VNkwrm=Fjf3)9DFccU$^16jFO5i55v$rqRBm%F?$w+cDxmx*>*N4#;0m{|!# zC3miF>WT{jelS+*rrKaML)&YQ84I9ApXa%uxmri~ zkz3(Fu!ftIld%3eSg5WK?nyLmQuskwbDsa>Sq@c9NkFx$I++o{z2dm<7WXjXelXly zgZmV4&Nt4V#kq($uM6kM;CutTcZ>JI@SXympT%=sc-{cNgW>yGd~bliXYo1Pm27Bz zzB2|H;6C)qXf_jx+RRBy2u?*bXcbx&DCpa=le;ogxxt zMD2gJ!XArk)R*NT8UkwV!j9=YL*ze}ms7hsW>||N#jU(&FnG;h#NV%S8TpbP^fBjQ z7<5C?ybBw10d4$JPrs1!V9I)KwkKVsF~3brT~|{c@fME^Q5Dw*S`-SpruEjqpeRD@ z{T>S@HXRxAJ6IkYkJ2os5mZ2az0v$ef9eLv>dUsKnz14Aw7YK8P@IMWn&r8t4rsCA z>l~8XsiSbA>=`+Rikau0-wS(2Ug zfjCC!__0ZKg8}iBl%NsUxeF!7#@Rj^+ki{pM1>eB1W`3x^rAl*3~61K%qqaRkQd({ zSQA+Y$dHGqb|7m2ByM$OB9KZPQG9GO&gUBjJy`MkmH*cYq*p4j1y=2$WD->r8AU1Z z)i1VdZHm-L(6e_Xd-vR+D)R|<$}wkf`A^PmlN~;c|HoWWnKnIg(}<%X*p3hJIa&YR z^qvDW-glN(+1DIg`mJc#ETjWme#m%+93MshZ0H6&I<|{4Y4oKzTmOK0GY_@)jQoX9 
z+NQ>C04Iz#CeY#)r8dTu-pEGe^TukmtLpC=-onQ0`N*E3@kl#geDCP?17yaG;bLw~ z5*FlE^m1qUIdU$o6^~kwVog{6Camf_M=aEK8`KRx0`CvW7>`UOcBnni z=p=a=a>Mzu5n25x&NOD#88tD-nP_O?{f#Z4kpz;CiY(r3 zvBV^e`%IJOb+I#@5zl%c2W0u|BY#T@GbmQXt8eUz0a&G^eV^K6iLDhaTuMK#hy=f= z;&gaxi3A3udP&pygRz5(q3%vn{yL*-o3<C)+1&Kg%l4l8ieVwh9V$YW=Gi`C%}5r5$F5R~v-HxJGc;6qn05Sp#`h2jbN5V`)4_ZLrktCok&T6kgtOuIL%`c`vtu`uBEYZQ)4z!^?AXK8B5jE#XE5%I zd8oM7hYYs34Q)XIiP04i8WJL zRPmf`M31f)1&hZCBDyB?zZ0oXVPnjs zyPO7ycI=h-fTsb#U31zgJXaifH6ECt_%s%xqIp>2(`yD~fAv@O>@Z;2@$HMZYuC_| zof?q;8&1S(!S5ccn=DYKeHffXX9;}mI>U|;WHGXZfAQuqg8hl!XZL{VTo#{ph8@)M^|c*u z>P?L0kTt2$OcErB-Se_dwn3KVcUks{+2GnYDf7YJ1n7gD^qE_%*RU0arclop8i?%K z?^#FmZy@JSca2@L^ao3tg;hSKD-A#I<#OG!RE*w&`bj^tUw z?g%Y8ll&hD2-lH_;`fANo5fZ0pDBYhht%3-Z$Ge5I{$ilj160sm*FRIkC4n4B@`qc z2ln~+bS|kUj^q?|S{wWhfh1*Ddz51{z)3}|EQY_aAo}Ugb&Z8^sM2#|=3hR>>Olzq zzq{>!n`*>=_XH=-8|0qq*X$1)+Lj?6^QJu8BzJ^tdiai6UnnMa@zNTX?lM$W)v7ILV}=NZX76Zt z24WPTIXZ(ylrU-a{`Y6 zb?}@dK{yvWmzFNObW|T}5Ka1}T z@b@e}XCIHe*m6lMM84c)O);U?LwFu?+`RtV2n#R~IABZ41+{v+w(6AXNY7-Aw&{Hj ztaDWCTiZZ2$?sX?@o~}`84%YR>Ad8Gz4l#uu2|Lre#e*JdfaS@#Iy7aQ!-{khIAaI z3lm}BE&Hef*CPS!Vo%J?u`>`h`~4tCKKlYPW}x;tIkyPXD{eXeiYf+xJc9`1Q*_w< zPO9G*m@Z<{r%xweIjxF(%J%o>ycP{gc6g(`APlqr!Y6 ze*>kH`}Mmwg&*+}^JCv+U?X`+Jx_Fy-m`}erw09-v!0j!auT{?5oN>EloAA8tif3% z;zyMzl6Iw`|J!g5bUo!B>u$_ZtitFfcOBU^#M7F=M8LfW5`WH-aEeLy%6w)n=)L4ib-SA4U(lXgDa@BFY}TiQ68aB@r>HGb%MM$ zqW%b2T0ty21F_Yvc3_B~z7ldCg%joS!p7cFVvHWLjq-hO;Ud}@==PbTh>KrLR=l$+ zw#wo?7cDPHegtt`{PXy>+LE*PJ~7*CA9bNG7$IaDV#C^%w;te<#894N3v3MSsn z_a>+#zwH+T3C2TUXh)zxU)&pujX53ir9&0zll$u)GBOCxhdaNFyXlRcJT3L-=av%U zui`iFs5t-*`~L05j(K4bZfCaDGt{uw@1NEX$66q&zHOkoVS-KlT4c*_mj(GAv$6)u z*FccOQSTK~daO1AS|H!sLG9C#5lCbcE-*A6g{y-B?}WobuYWYyR4hNzRVj}hS1#AI z|H_P=8*2JX<-v@cQecuvi|__KDHHJlKN7)rA->ldx(N1)OTqEt5C>M<M*Ho#|m zNB9ryM$gdImSqiDIaqZ&caVB&M_(XMZb?JQ%z}5wiw;9)8pzK4%OP51J$Cz-kNK|r zXmplzw+%KbX>%vt6G{&oEB@Pa(Jr+n*lWvSqvSDhlXg^?+|7tAv7~dDW0qcCQmYX5 z^|BD5E>i~yX%{Lv!eET(8hUd%U9v|WF-B?zzq^HyPd&9@e=-I5<+TEThI%1g(ve)p 
zTW%rC#aC#Ol%{}9e21BTofksu-omwEVvDeIz;vT3lfa1})6&ZCg`9otbU7r^1o7ue zBa{2x3FeKT{@imgMW{x$p0RXYgggaZ${q@sLzB#>iHXn2F{$56r;cQ8qw*6M`U-{D zQIlgl!H;%gq4S=mSp>syXr#m-uFOLcJIUpmN#@IeeKHvuWf?qyHEzC858H`@L?V7W zIqb(n!iSb$DfSbvvmvQ8*rzktVCv$ji;{cj11Z~=4P2LjtWrJI@SXympT%=sc-{cNgW>yG zd~bliXYo19d!Ft2bgc>!LY6ooHpz)y&3~ij%IyP67!PTs&VNVyDVP&sL<=cA!Ggc2YYR)=9%^}9(_ z$BIE+=S0K{m$S%apW&c5^GkqPOZNs#uUpD6t*1&W( zLc8^}f`KJ7k1ao92PyjPlF1hx$DYB88W9i^!puF_WixODS*$qd`EpMSG%Ly8YVkY= zy-U2gJtrgwBv_wi*Hm_(&3}b13)8Hl!D-j$eeFBw%!yN5Csfn{EmvW+o}VVvq44q!KPwC3-Kvqy>rRVk z8ddmpZE|8ay#17#*mS_TXv-TWUuA*RM)U6!GY-V_X>HzZtqgI}wB<&r7(*$@XQR-j7SuxMWcCut&z!R-&-2e4s&avv{GZQ`PVQ)p zXqFbJJ)>F{0W(9(Y4JsB#TS9rbM^24c*&2?e5m=y@hrMt==6^-=fBatyh-97ke!UZwZ(H)}vzP5ts;dewlI|A(KXyf-ZKYrLA2LqsPknD&o+%RuEg z3zbQCTR4XyxRi8HK(AuJ5BOA*oDArw^&rC=dtD z{?V3^Bno53jdArgznP&}%5_-dD-0zq^2)R=)WPmil@Y^KpWtr-M=vptH=|jP_XCcQ z?rh$wQc-E7zUR(bBf0lsk{4!pipj*C0zulz`p=Gype+ru<0I_Xz-@yfiSk_usKG~8 z%)@&hrV;cYoh}X|+nytBD^?R|Rok1*^^{GbRa=9ac&!%rdL{NU9k)INt+E<~g+~9+ zD~|haaStQz2gAKJxK9D+eB=CCoQsI_x^RvR&Nsk&w|E~6?X7H?s2?)g*c%3%#?lZ4MRTl#jCSEqrBSCbP2zW6fL zp`!>*T16U^)X%_?Ve$KxAE(ekPv^VT9bZs~;j2-4foFk#0{6i(y&`n)p`ead0~KUq zp~(C2VHrGnwFA9J-U^Gad~MQ`s6ywY<&9I`HKTVKg>rbI{WwEt~}E{LZ?+nNdV_XTXBdw>4ybC+I2+@cvVk&Rf$ ztnm~hU$!@-bJ+T1*tjMVpAv^O_IW}1m+$;B0Mju4azBf0DhfZ#&iX4Km7rYy;c21FdIXI z{wzp#o*9I#5}E}in%@zEtKTb}{mu(M>UH(R-{S$snbe+K39P_WA^Pw?pIw61)&KJ> z86}R#{PTZg2Wv-npVNU#lw$jjzZ8aK<-Vj91PVaqHKP=Hy~@zrJgLVd?h#B!aJ@w0 zaiG%1XVpnPaX9kMxDcto2q&I@7U$H<2950o%LXMa65l5BuEbQGfKZk|$KHX%kP5HU zt#g`!(EX`OLz*>baP?k`F7MP8z`xE)-<9hG_DEQ3?42Wce?V#EU=RlFL==86wvUEx zym+8!Q~D5u-@DFzuRIhi2+FMAdUpo9&7jh7;zcreE8mb+!F&%~eH<#<-6MwGIB{zC z0!64R1V8(S~e|mlEuW=fJOxn8WtHXQ1HxJtOtQqwq0JDHLO<-22c*`yC)QQNi2U*=BR)J(1>#+! 
zLB~_Qpxf+e$aX0oQq6)YH|qp}i#Al-JR{SnOs(;4h~za{z1$>e$Uy@tJ~d@2Op`(I z$F|Tr*Ux~GTYe*VHOaBF&5w`zoS1=iysU;KR8E7Cde`n6Z8OOHTaIYw1643Fp-B}L zI11B+aKu~<>_d-VqH8!6k&E(Hg(+JF7!q~ZuqN!?gQ{t~W)_E}A&$9;)=zHTXoHRfEtm8< zVMnpn5`qn(v-&(e(aN{%QwK8SjOH zm>TrwXEiBa^&D9F&W9tx#6@BtOSan9i49_~hkt9-9|DxJVc(u+JC42&YnXpnR)x~r z6g}lWd=4~Nhl#T6s)7dJqh}`l&q89!D)(clYEkNl+9X|?^Dtk8ERV}dF1!~td${qW zI>Z=C#-}+;30(a3{(B^%44xXqNK4YIfQP(iCA7&!v8vSD_XYZP(EYMZ25}>CZ1qB4 z31OZK`@QVRx?ZJ-C<+zoG7qaEG;BcbWUn-K|68Kp{apj(`q;S&dOr`yck!5d+NeBo zay{fu^QZwrdAGa*H&Z=7q4}2OQoSa$$N>H9nogV~ zWGU}|BjJi6@NK?Q^HFLQPP%=q@QK<6oa}gb0ac?wX1!wfFENOMNYAIHBmxo0msasN zd@l>qZSE|s?+?S7YHtHr%kt2x>~1SFt=*`gKhx+xKl8yTjZgWI63OFvHM3qw9ON^i zO5)~cK@)ig#XtWCyI4@)KQ4VVc-J{c;w3Zx{+<7i<0(Ex$M=r&2=sRC?YOMKIY7ct zXjbaV13T)f3ImiITK~+FP3kYgU#Ne~ka{)nbrmUPQr;UL+_V`P7|KQ^4xFx&@-eXT zmV~!CQz!aGT;gF>LME&%(B1O-LL*whF5CK23Png| zi|iQ@zgM5*cYpr@$H8$pU)SqApU=kw_x8{iU6g0S>x6!^-uIo zf(^`RJ`KJTy-el;c=1)*WB#6;%3$l7{FJ+dIMkqWyBDti3DJr=;?At~93v~gFwsYI z9!Wb8^G<`r8Co>VNVKKb0-PO3moQZ z1mA$@-4cB;qNhOQXNg=FkvAajV8niw*c%Y$)~?*FI_?G>1bL%xUk{e=UmOw zS_R=ktCGQq3MLd6Ev{4JqQiG}`a{#UbU~+S^c}|_CR9+n&ptax9u%Fpe%Q8}1)eaN zYFN<3v4>z%Y*29un~Ro@RwHq#Wsl{?T$W`3w8k%jF2xGMB8_h+X)8Y=^6$w5E|o7~ z&#qC0XsPueBNa!!@_jxG9e8iO`L490VQ0)68SZlsSaP?$bqj|W@UabWq{Oha6wWS# ztybjrsyI4*x*99^7I)+Ez&f@RdR%N|V;XDr8q?jnNsr#Hm+#GsW5QJ!rDFcQXWb|E z!($9;@Q*6Juf1^-#+f@6?OorGJzU-n`$O_gV#^eo1~sEt{UL7_0rfU)>Fyc)_ntST zJ{c5C5we6uS+6G_P1>z9Zv3vne4Yi2IPA8S=T;z{wXb73?t_X~#852$a)*+h_+Qur{CD@j&-q`W|9hk7!Av@MdVHkE~z@=}$AgCIw zUnsBmt6>plGqL?F7b{7>DZ1bHKSV3rA)oT;b408y?#36+NW{=?)tt>i8t+T*UtWvg zLo=DLxLf2YqU|ibNsAdr(30_&^oO||@YtRCJJA=Mab2~a-Exmi(RJQ}Q&W5)_^qks zS*?8^!E~;;$y*^u)S{VA_Ib=Tyda#PjWWLzJZFEZ#&y6EwU!ugO&IpW+o<GFXX&UUomp#~Lh5yApe6)Se~$fW`!t9w9dLCh?G^)Bv6t`Z&j^6U{E6E$ z1qab?=68A5!YI&&(+7XsRvf~A41@-+y>$bXg`9KeHSK|EL*K2b97U8({;3(_CWyX2 z7v>ses*E4Hl=3w-#S2^xZ{%ZlI0H)QUN$;zDWhcOZ@%|VG$SGdfI5Ih26r@Ei4L(m z3&dxuMhdA+fcb=XRa1mJV8b2j+T%ZA5fve&_KsE9qBSjcBKIg$>wHkNEx1vvi~Qr|Uoehh!zc-ajtA$kS_s*GRnNq6l%L 
zJ~AiBps2pT_?;6lpQBl2HgW)C99!0g4dJ|9mIa*D>;*THP{H1 z@9k95_q1Y>Au>`95mPO6Z3*#G!R7eM7yVDO;7rc3OOt|H_#DNFgXYX~c<@1%H@Z(u z(eia80n-9UwD_@Lc)G1E`dsh%*fXtjsEUI9fPTdj=x112bN7q|9?Yj(@h06AeJ-04 zzp(ZQDowJ7bk*`mn%G-pLnPt-bCUda)U7QYz2aC2(MLvV2 z)l+390;+h--jLnyH*@elOtq>``UFU$T1Z+}UquRk#9Yks+(8;ICT~a3tYN`T9yQHl z=i$wLGp|gieBq5>=ae}v@uRQOZ?fx}2%>C@RJBwd`*GUc^cv<$8JOZ0*J))L1VfJR z7fzZNN0WEbYYI2{(aff=j|$KJ!Ng8-Nf_o+!6{Yo1|-P>URtm@#MtT#8rQz|y*<{4 z(Pi3e-tqs8ai?()@@IX+n3^N^d)yd82(LKdyCpn~gddFX)(}1gg7Z!AX9+GM!RsP8 zG6dg%=-m>1FrueGl~BAy%P#D&8Yx-rnLJ`<-P}?MU+8D<&G$xb?J>xj;#mO@Jq0Cwz32gD)%O1`u-v% zd-HgC5d->p*_eKX#G{vp*#G+Nj4xa(PwnYG?*KwHYd_lS?;#EM!iQzb4x%m893@_D z?D&NjwNX!Z&H^OAVm;B=3})XF@{$wXu9Lh&y-zN52FaUJpyYe}0iy|Y?Y2@~!OjHs z_Fbo3LKJu0bmHYuyna}%%KZCT+(|#>8R-s&3;gt|N!u8}EQCW<*hzbXYRmUIhaK%< zI=hq4WzwA0S==glUoML0w;g<->*fqcbbdV$)ZM`LC%Sz~@D|34j^D9o|1r7FJ%0B={Agvu=Ynrv^>YwPA;YmvkYSkeZT3=UOPekk}w8-Xbw+?@}^mC z+QYqtif*Z`D^Mxy-1~6%5a61wmSJ3x3ihZ3!-isCfiI$MA=hftAl8sR&dm~t-s?`& zj^=v`2ZA3SqB@fTQWMH?~jVf^W88UJmBGepxf*Lnz9uM|0Ugw)%8oWj2m zmn97zd$Gi^)v3UIX4)D}Azsw0mxIdp0R`GPU0856P8)u@{MXG=-3>Agl25R=m;&J< z{xdyV{3!mo(g`_0jRKk@uaCOihEocSLtg*+fpmuvsb8Y8z@yMifcM@K^4aQ6!HWw+ z*y9=vr{62{2;ViE+NlTsbf@ap=$H~zY(FW|VtWE6KlcBxKv)zuxN(j1F_WVwUYa@n z^D~e9;Jf&g{0BH@+tC$?e5B+ROjzP`1s)UcJ+AJ7(ns;gy;+K-?8+cx+h8i zOm=uO2#?Hs^6%6ZO}{W{oC*6N`R)y2~bBahqX zWCL&?ee}XV4frT4-S+Z6fq&zYI3KE-0Or5W&w2`%041`Gd1ZD(yr}LZeSu60d@e5R zA9JZ2xgqu&W`zF2EQe|hKUAU6{FZNNex4%a{mFdf>K-fHpL0Jt=i?$`^@MB8w}lg@ z=&*XQJLdrN|HLv~=J$Y>4s7!dBhnz%S8LRWdl@OXVY|+-CWy~OkMX5bx`3=Pt@@C# zI6%)cyRn%S1+DI$6EGEjhh?fJq;V}hLF#vt)&hp_Z!)sK#I3}6SYn<}** zhL3hTs_DiL<97#+{p9l@N4IP5wfyd|fnr>pL4K?u!2DR)n}m~ku<(o@U+k4tGSYXZTtS-N!kjB4|ktXyUd={e_Vk>6|*H}F&$OlB=+FSA&yO^6m zfB4XuGb<^ekv~E&>q#|W8JCPx9mxhIOEr6c^Q}OHiXQjBy#cc&2r`H~13qoD2svj4 zf}w+F#Q&2g!^7~W{mF|iFz9U16VBuu(Dj75FhrjV_q|E8GiK%v6$d%knQT8|POGyC zVOy_}i=yo}jJnmKOix8M{HY8aR3k915*zRxiodYrUBh68s%Um&2(nz?TDb^xV_#Su`RcW9VkR$7nbwh2AqRa64|km)aZ!`D`16-g 
z$iG`~jakbE)+?}NSJ&G>`H+Nrb#B@y*?MlG3e}?+wyP7D>vakRcIWL^+Dczc|M~2^O#-rAsUbC@04Ra|Ox8Y*by zE|x8v22uCoyB}r@BQpaLQY0P+vN`o!i_{ZGg5w^`k@|}m=dFIROp`{$TQqwxWn&VH zimi8!;m^SYPJH__aBm)~_q$w6CenzElvb5yG|=Pq5@!24$wkr4iA)KJ5Bhi%7dy={ zSAINP`wwl@Asl3XI{!reyA^IZo*L6wEQrTYPM^J%fdQW29?KpNYrOkpd*caq4!qy* zspBEVx8TzUmNRkXR``NuNkpIqf-hNSm=1WA0_s1bLM5cF7QM)A+KW6)(7aQEuhI74}Ok0zRd=dmDqOSk}z)Z&7~ z&0k^rrm4T&<}wAchcA{Ke5wlxuQ=hmB|MCTAB^zU5IzNh^G)z)2`(bR>moQZ1mA$@ z-4cB;qNhOQXNg=FkvAajV8niw*c%Y<+5elf37^(V5NWjc=TCE{4iP+fcc63ZPymdV zICk}loGkin-Qe+Dx+$>fsJ^1DtN>SUSB#XQtytqpxe?Y03Y=T`ky_X2HX{4>LT9&* zBM?%n!A*9L!KjDc;BNmW0(j%-Zpt&`DvHW=3n^1b<~Mz}@4ctsrjV;tgvNFqbL{V} z#!)SB^gwHNg5x08pZ0f`;qNGR=<~#tnWIyPP&^sK(~s?#YWNrSESwj=6wiG!XNVK; zyl_=Z=&%^-_X=@d%9X=6{$#=u(jDxHLF_1Ho+DV88T~SDk@vLmDpHDZDQE5Sg_vP@hhv71)pis{7~Jf4?fO+J~zzC3_s`| zjp_I6MbdAwaECo@#HK#emA)Bh#gyueyh@+CQto1pleKWj-NpYdqsf$KPCQ6Cw!-PVpA%zAWy z?ZZ1{!zf%v%H%4>CSjO9=R?knRa z`2pgR$HdW>wt*dPr1>fKE>GH^&=+5{GNo4B{tW&ykZ3<{mT0Z1yW?GUU;MAG`_FoCf{<}VC1g6W%FD%)e1v@)zyN!;t=-_=C@#E%ykV~oGE}jZz!Y@jkFL+UR&k@!}A)k~Bz=rFs)+4>v~R6a6Mu&Y1{4>_bFfnN&(oXHbM1`FH)Yh({9 z@kb8r_#lF&JB(ps$$#W@jH@e1>ic%Kygh;+w=WTqO{f3O!rd3lRFI>5d~u{&$xBz$Q?;j#Aq3ixvxOxBY8 z%ztB61O9nKPm(Bm|9p0nUbp@Kd6pihy{6?tts#@R7}cnP9r*1}(J}}ufrv?^a(l4~ zd=(_JW!gN6G#k<$VIk!c+DV(IlO|c%w^7>&QcoON8Qqc~^%pVFt0d`Wt%G+Nj5#GZ zap2dNQsYE_oxr8{?&8XoD)?YmS|;NYb98WRsZ_w!6*VL8jBK;ALvL%{PA<+iLw{az zGJEplI?U`#)t)$U79Z96@crKCIaK`0)$?NUA+WKOsEH_zzy7tcM zitGwf`{rxW_kwjq#?kN8<(^fn@{ga0lf)UAlW@FfS@9a&GkIwJLref&j#`PNv*kqJ z^@p|S*3jUa6WY727Ds?ehOx{vUldfeQ696pC5ldOXQbRWWRqQUEOK%U0 zD0nsr{ltuo;E=>tSgWla2<$e>5B^K?9flk&zwX|OY2}E$sy;S=`E88v-#hme$@0-1 zZbEc`a`z;@enj@)7n<|*PhA1w6(@YRgolyvgAv{u!lyuRz6t&;!9^r^T?9vl;2RLV zTcQs}^c0BvERpLX@&?2mjM&c-djsM? 
zE`pGC#y>W`zY(KDWa*ud7ESXt&_GxZ;=Q^19)`_Q$|7x8;M2EV$@IxB7LYHOK4v#55Zt>7NKhg{z zQKlBVvC@lK-)alpCglxM`E#Pq{yq=MoY-eMNc-6fuUP96NqYn1`}D6leDq-bGxNVJ zdX<=1vrqHN10j5{L_AUZ*JT)0*k-%3vw+<0Z>oWF*_d)aBtb%wdc*qCtrY zN3|3ILxLJ~zJ4OH=p+j?`mS%eV6P1Lc`AT&a5>h}9LFO^`vtMAeHqMrtr)TXm7t-@ z@CCyJZnhlYHwU|s4V?wX4)7ZJ+NkO0D}XE_-`4enqUqBK5A_>cpe(f|I*G z0-HDLz>nrQ^_h=1;maq-Fk!u$s2DZRxr4mXklTT>GFdqg{-V+GXl{!`4davaPxh-o z%jjdjYDY?e#O;|dibt_1%@ga)`K3I#o^Ud^@OL)IxD-GCZeJieZ4+G3>ScUw zAY(*Dd_dm0)@kHx#$_p=5*^@5eME1-`2^5UX?e&Te;fvNOg!=`mqBT(3xiWC4xnE# zQ94+l1B(2j9;cJ~3Tm_RiGu6yfRJpkWTPaSt9^OV~wh13r-t`|88hx3uokvL|3EV*Et? zo(25bF0L$4E=uy8s7)1-elEAreq)U)(p+fg9N>Pyk19l|<^~AZ0WQfL{(!oR!1J)l z!&g6~pt0+6aY?ivSZsZChvZp8mip*KNd6B@MEh1ciA#@66rNWi@sg39WT_-RMSA$& zPFN{*hzYFSG{|IrbsVI(WGHH934r_d`xo~L&ja=yZRXMu4_M|FP&>j+;vl$6QCH?^ zfyxSVp`PzIV5J&tncsT|rXvE!$5LFtSzfm`T&*37J)Z2Vn-7AwxC?kTBOP&%dkT8) zalT;9}N58bUKAvtTO?y_R{fbAfp4#-s|i> zn%<8-Jf5jNZm)=gPqOXHXKsTM1BoZ;PalDzko6t(X$s74Ua|jOHjMqJ%=&5b500ok z^PRiAK8ozuQLAT)l?A@*R)Nru5sKc@EeaZQ2iuY=Qg>cx!-(%;l{O3Cu<}rWA3keq zi0km|(UJS4clMVhW@|Z56w}$X35S(vU-bSzxEbg(Ji@|=yK8Gc zUeeq|*lsYMojPz9rgZdu7NQNX*at-&^L9K4_;v6uAuCl+9RKHmLaH?o$;z=uCnh3R@#lZU#rfZNWw#K+Q? 
zz^A|?W^DzGcyiSHAAOfTV02f#We)TD!K{(jjTUn*P|5r6?+_zjaKY-K-@$8B7`3#r zgOh(1BB-t2oK@D2)ZV-=rN-$CuB)uyNi@}mYz>;H%<#+b+Rqo8OKhHSRp$NNDPC>V zu<7Gm)eSln|NLq&1CYG;4Q6-FZpfk4-v+W9#m?XGWhZ}xW`;T9k znK#NkXBm61XBdZSmKy%|h(9EZ!fZw9@RT+vm_FUm_3?&!8@ zNyL)D7<};CeU0sa8(Px!erS`y3Z3{8qqWoe30hxhh;nQ^i>igb*?t^&1sWQzozZTT z1k8-A{Cra1k=so(i&sw!ka+p+6nkC!(7*$1`fjB$ux3xdF?7cduGP-8*&aWH2DD#% z_NRc9*N(CM%#bdKKCYs++_O%A;@GeB&-s3^!y{VkXRj1GzC-nRl}ZHFeM#rvuFr-J zsWy~-e{d1%9oKy+qoD>QLf>YuUNV7Ho9q_eY@3LXbg5Oo);sLRCadPfwNIF5M+EwA za|QY8EL?Ng-3hiV?X)>$S%NHIx5I;r;V^uqJx)`X9n}XlKEi`Lh)31X%V#7%bFjw@ z{?8j4awX*@zI+CrXK6dX^~oIi7zQ|=lQ4mwGP)Oy6O`fmkxQ3Iykz7?U%)dG#}grI z`^rG#%OTf9uEj>P=OH)3^rEv2s*x!DN~IThM^HZPDlGLA8M^!G)tizUdGt5UHjX(6 zqIbg6tApuu@xU7QE-@to{95&AI)v8(Cwt5Mp(72&KiC?zRvoMbtgJM?cZXeYwgT6# z99}KF!8}0nQ)Uf#%TQRZCFF`bbZ;8|PFBNDHFy59WP%`%J?kE8whL}pC>~uUZ~`ZD z`D^#;Bm|npbgy2s7~<8h)0P!Dc%i(@i>YB=ePDzsogXyaLfWT19TX&bkSG@2i|Z3< zn10Lr+3T0C!1WHPSSIb8;Km3=nN@`lS{&TO8Pz<1k^OvkB3J$aru?qd$HQ3*|`Y0+|3m~&*QS`OP9A^1^Nu<2t z4>x3;^IY6maM}q5N!}b9bnO3o#R=an;bA2FV1&1Z@F@_SZ-PHda1jY!7r~Js_y$Dp zmgs{KJq02^OXRwUya90sBlfez-hg<|5_9&W0n=XAIYrc5erCD&+f|Z();Tr&p#@Ya z#_x{NcOhaT_*4HSa$GU5nNIq_GP1I3Sw$^*2YfS=4~#1Ch8MyOnr;s5L&M(}%|wPD z#9ueMWVqbx#W3mK3m>0df=_PN^f>aHf%A{w@K(P(35+U#F$!ArVVMQ4*LuIsV4jxh zuB<$yJgfF3`yKXe|dwB)Y^JV(j6?w*)eWO#T78B@7Dd>8!(>=_xnkD&&t?r>~f6V z1`P3*2UwDX@H|OqvsxeoB;MBBdxppXzK)qsAC62Q%1P@6#-(4d{o8|T7R16~~<@ox-n%0r&nNc49oMnuf#_nfQ{BMj|ja6|#feyaCy}mY1q-vSg-jRAp|G5YAq{_I&;bsPf%HDAQCA}kQpFaA@ z@aYr?a~NrS+D6)EIz*Or$n{}Mt-XgvCfQ({nfK;y>PeXE^ANXwtOsi!EL;A*k7JxJ z>E3#WiV&^=daCC85WB&XeISl=MPo#(SWG>e9JIwm)ymk9)m^@)t@Nc(kmNu5^B>gd z#`5upUL2B)Q#H?TzNXveX;fxJGlzB@2 zKW`|%@oxG*pWRqeX5Bx}QVN>s`_KP@6fY;J=QF~kMK$mGm3j?zvxHxUOXM*An?jpj z*=Ai9U*Y~zdV2U;tg}z+b`n_s&?cuQ;tX>0{n&YB?*n<)x}DzP>)>m7w{6PrTzGnU zV#DS9OXwQ1l8P0j!9n-CSMEhzgPB#YPaK_MMJMkudZwlMKsM}q*w;USP>ZAT;x`di zRD^2ocUXrf{6QDf!ss0VtI`#e59}O5HyduT7KPeFZ$0sb$Oo}-*koIjI-MKsU(^#H 
z33C7xCLbd9e|UoV{XKu`j(!btt6`(}mWcpJHtV|5mb;h=8u+h;!!t=mFxCCYm0%6qwja>zA7OB|vv^jrz{&^<~Hq^ZZ$@5}wXBCw3`xCqf- zP$c;ZS>{8z)lHzH9>0u>jXs#yqP2fNoPtcgp&c8(>4~Wy>1fdN&BFxnyuOl*uDZD^ z>F!at`!&9;w#es@x}KekUM|={CNSM{<7KRRBr>eiFt1ATTX1~TO`}O2z}jRPxVk%- zf%n*Hm-~7ufR8s=xO!F|5MFV@cT0E}2|pO&ts#601m~OJ&k|fjg4acGWC*?i(Yq!3 zU_?)W$j=hFE+TJ0+`)+bEU`Br-m}D6BpXN?k^-u`#@V+WlmWq6QEq zKXH2UdLuUfif;6YeKjJyC0R;yBNh>;6v|hM`B0Zlb>^=mix|k=ws%$=6b1}}3WutK z_Ufp<{XTv5h!0dry2Jfx)EEBxb`gL7G!oWhLKi$+eBgRKC2~%L=E1rFxSj9|ox~ zKp9XV56{PakqqC@4C*^WFm6sMFr;Ap=2wdZ(0FwGtNKk67lvn#Ry9@$yz#%N+~MU0 zy1hG=Qbw6`#v#3!v2c3DP&l_m&jt3M!$>i}wVt)L%i+rc(j9h0|9O_GU$730G*AQZ;`95mkSQDwz17+CmK2>1?jt2Tfhn7jA$u^I9UDSqkFJjAlw&}yM8u54e*ytgrAnq1Gg;( z9mb6A0h5cdJ^VMW1F8_+?4yCdF$at5(!(X1K)rNbhC?eF+}F5u<*e%lMi#WBZi;T# z)hf4G->8cMr>yIW>`av&S3-S=o7lb zUL1pPTI{!}e*7HcW4^A}-158bDTJ(CL$NpSOVDLJ$6stvu|>dnyXKG77zDg z-?BXClO5!N9d8PIuca(JpP+re;T0LYb+_bsKZ(QK;LmlH?$CFv-(K>K`U4Fxl3Ww7 zeozc}MU-kg45`8Ev4?3Ye_q3KZU3rXRXKq?jAxS6|9&5dvb-Y^*!i%Ib8O=^t;3iG z4Y$RGKO3xIT7lxIzO6uQ=hmB|MCTAB^zU5IzNh^G)z)2`(bR>moQZ1mA$@ z-4cB;qNhOQXNg=FkvAajV8niw*c%Y>GB0vv7dSQ&8*;3ip@n*TxQ8fm2a*@qg15Ktq~q^l zviomEQw#G!&XM5qV&zQ@X7&8x9si-aU);~n42s2K#cpv&Bi0WUh&fTW{6%zR1>DwOk@rroc?q`EoJzm`IzwfG&d!qQqr75N;5|8A+MyKB_ zr{q!3fuz>aI8ppbXyny&=Op+|j>=($G+*nD`BzT8;KwKIb2Yk2`6Sqz*!Cq>8m$@< zj`bf|!IU@cZKOl}Amv_dlZ*@l%H~UF?Kmk0g{Ys^A3wcYCs1LyuDsQQ96Wx~sNS&& zS?)dUva8jG1-|g_I@Br$OKRS~;weU<&AuZ(?g!VAJ1OC(hI7V|922F5t``Sr&@VUSfp&`_MdSH$UNr! 
z*nfoy1ab$k$b&<`Y}DN5?|#wXSla?4Yq(I-Nlx!T`@(^j9to@+|tI{RkUSIfvyHul5XtwYIh{KLHE z#Y{>3D`VB^5VgNZb79`u*p)j_8*rC-E{WsaCi(W2%CC@xfn1kq?`!ae{UtVMt^@cL zy4I&V*F;JCxlZ1jNj!kB(M>hG2SZpGkhWf~TBQ1dkm$bJGVC*%-ty{mVNmvrdb2rQ z1Eh{j+8lT|g9J}6ljpqsiiDgLO;YQs$9gR#%f0R7K$mLehu#rwaNJVbXFz`qu?{fT z=$HMDEKW7lJ!WabDz!}6fZhSP6_ZV!md6bP<*QHbJ*5YN4;-%ByCI^E6qL6Wy# z_nu1PYz~&#`oJ~KCjm*#&Iq{1P6HPGq&Om7)=2+f8X;p^>QIZqF2H~FE4H@e<2>}H z6uT~?uUSo1jtxrgm^CyeBZ^}8)ne_qA>kD#e7A&$k??~N-WtNEKybbZ{w%>oBzRo} zM~2`V5WQQX4@UG9i2N*(>mu?7#2t*-&k}nB;yp{uS?o8o#M?Y%gjK$CqSDV~hk_G z<01;HXg>MJ>oP8S9RH6av-88X=^x*K@%}N%B{v*%O%;(pt0WJd`UkX0Jq2tw?vLRa zlL^dero`St-Vipv4KyC`>&1SVY#>ppBbbx$Qy%5x77#h67ya?b1Qs%PbzBgahN+cy z>d$fxfpffAX#7|)68F}L1X8+>Nf(F7T~jZ`x(DgbQ!69z_vtNd$Z!OFdcC->C#ntk zbNBC4ZKEozN{NMbpFk(}`sbdk_9X$hFm7_=-SxxZO06A*V#~V5A-**opJZ?BT=l{Q z&TlV}X5gRP#xH&Lwbh=uYNh8^yey3i^gYlaJQ@LPfTZ9CVGNLH?!qRZ)8gzI$~j!%;M zBwkO?Ag9&f3H8S3Q~ctPeeCE%ozPLl)t7U&>s24Jo6^`E#7K^l<;VPYyiE)i@qhGg zJf#cCHoecbNcmwhR8CGmy=SrPM`UUxIlr-k6(8GRjSh>M1)*1I9a~>d3 z|1K-;+B>xk_8H1 zFWe>dGLxk(4H_LwF)fCQSA$yxSUs1?;1zp_buM#s7TRne$%zuI%O`&#qf0-WMEM0! zY1$7Sm@YZm#->_2y^sR$$G`OdQzC?R&c5+^z(S6WTxv>t9Fqb&z})hzEV7M=004Tpim!O6hbd1GTquqLhu7!pV`G0}g7i}sDf`BeQ^on? z9Or8h-x4=n2`fryrBMDUk9VnVc8E-ZUPA;tJX<&UEAka0mV4Q*ujCb0&!kbWr*nhk zUoX7LaQi+Ia;B`(-Gdq8m{_sce=LCeaUH*!rV6b1ay22;rw1`HEA<(FGm4#H*ExPu zya_27F@3&oj5NPL8Wz5v5rAxrvmfYM1pm(~PWWyK4QtBG*Oa4Tw7!v7aUO2E==on6nHwa+I@Kzag58tHqu{U$KC@ zX5{yc_Cp70DaK{nW)1(yW>qEjx7cNkuU0EYlgLk7rU`bI&zRy5V>IwC8_*ig_-r{s zVu({rhQ11AgmV;AW)_#r5!*Wk&(t|fNvg{)m(`xtWBKG>(`Kh%A!M)Znl^IoAzI1e zSzaVAqL{MXy+b5km-Yx}w-$*b6VSpR38Jf!ZL5-_wx}xf7WqbVi0%cV>TYlAID%s> z=CAJR6dOQ^m(gmWPaBat&)&1pEOcVFGk2^88mvh?SVv9LeirEoJnk!nE=>1pu)P%>lOsz(wB+d*F7*Up zU)YVBxTYUr_f8vC>XX0(N#mu1GLUUYt($A?1nh+J2KBO_#+z$SlT%V)f9? 
zwYau@+|JNWd&k3rG*1UDb<}wiwNP2_2%qnn7oo|JsY%owF&sIP!k9uQin^*;8r%%l zf(BeY7beO_Fr#+SKxNYJ`>{^_MAhseK=S8cKlY7a2TOlGQOfVe#!Nn3cc~adMtHt> zeC*-?Mm^ipS!ze2li;#?V&-3rw?ILqeS8U{XRbz`{GdQvz~q5zo~$5SVNr@RhZnk5 z%_$tVCdUOh=4Lg6x-g0aFZCBcw~>c08&rQD3O`f_Dav z8#p#yhsFL?6Mtj)(Ttf>hRz1`_{9~@eL}(BkoIK$s7)0OI?rLWcy)vha&WG7X?F-f zrT8`U^h^cjDt@Lv;$#a}9QC%~QCJ-^d#+Tjj6xH<2(Df*XqEu}rN?x*-F{$)G9#8b z#^*54yU=Ltb1zaJ6!%;}TLXOC8#xeJO4>Wknzu%&&11_?vgcI}&SL}pMrR}~I}xq@ zl=f3Pzv@CapY1(Dr2)6P_$fC_W$4hBo+@N{9r1PWlAe8#gXG%%-5f4?j-;qA<}TCc zVK$S3p_iJE!cMW)Tg-*b;J`)@k82DYEEfq+PNVvWL@!?MTc-JnOt^1~+s55MvZOUS znEY;F?RA1>eMc04Sl`MFO}{b>cIae4xG5mv6(@YRgolyvgAv{u!lyuRz6t&;!9^r^ zT?9vl;2RLVTcQs}^c0BvERpLX@&?2mjM&c-djsMyHI zMjZ*s=q%h(fqfmTz9%jH1bJu%uJI}1NKEZ}?&59?D>mSWXd!XFg};={{o~I*UEWUp z$3={y)4u7dMK<{(tgyY3oD$K_iC!IM&3HPeX=*` zM#NGMHj|04f~bLwThNfSFKFtAPe~pa`Kcr|x|TqU@)v`cVPgTNNdMx`E=?>#rnHGW z3>6{NU#MkQRj&bmvu#19W(&|Se<;P9JRBS}3_n>};0Gvd`On3a--W~DH#g0>vf+~t zs&XNA1#rpn-rqONcfoRyL7!8XEBQm<)efP-phV?=W*0ipv+Heb@iYvoO{D=n&)hWt2k=;UbqqtwESN& z6}rq}#pM>_hch}bX*rWGmAm`F(j&>v#I=3U(Lyfhcbfz#tYY@srAGmWf_8BrOl~Y`F5b3HioK;l+PR6y&+;N!Hje9)6XC8BEa;_7s&$L z4mKon)7FvheX(Y25r;)au|=FUXhX9Txy6{=s(bSXb}?oVz4Ma=mp9$_algZ~?XPPL zj&!C4g}_!bql}{x%0xB_Iq^#&u1#~sg)rcx3uP=1|wjP7ltS&oC7Kb zC3j?tv_Ro8;||KnOW@1iS2qzLkzz?J(iemVlZgq&oJ?Q^J5P~qavH`v)TphsiUH=Bl!aKuS4g`X z7WSm;6Lz2K*)Oh}TbP32NwY~AE7)WF(q=%&5gz$Nch1BZw=v7AUNLyf0p1P61*;g zBSY{Fh~6#H2P1k4M1GdYbrE?3;todaXNkQ5@t!5-taoK_n&JWz8ZGLtJv6n9Em=&4 zyH=WlK3Pk{%5D$XC!rZvJ}UukdZ9(T0uxxGU!v~uvtN;y0tbrvw@Kfpr!{WL2}^(t zrELtO@q*LR8{AZhAuv%_yhz3MHvF&}%1lK%J$ou?gqKj< zhq^5vLLX(FfK;4ftt5^Na^3Bl6+5B@r_;`|i=1?aLY)qUq&^tJt3&h7PEp{@OtG!)jJz=y~W5=@IqWMue45 z`U>OAV2pb|`3U2M0!+ffaaWV_EWD7-EoApc7HCc@Z2Qp}f)&{WtK8S8LDE+ie|0J= zK-)Fw*X3{+)FZyfJxm;dzV)Dua*Hll`nqw-QridS&nq1J#uo^k_~)6UltV$s8A^C3 zC=iZTh?cx;aYAB;<>oj(pvpFR^81`KG-fD7_NW%+>IvAAx;Q~#{DeQFI- zzo)de%)t%J|Da9V+Du?!HTGAt^&x2fX3ukba|z+&9S}=S7)N%u4C(KCDuICL!p4yx zMcDQ!D(9#>J*-dA6TLCAg3t);!<+SI5VDIK16(F1P+jVw{H4uPpdcBx-tjO4k9p+S 
zTqc^3OEk8*{yi@-Wh?Fb9KOxSPc6~Znn^OKCj0q%M?5JHu>bkg&kyYI%a;t1E9Q)F z{MT-1ylfxBbM})Y=W-kNgeT?l4-$ur<9Tva2c-%)%~?HT#48EN1io=ud$7VHf?j<%XQ1H<~2vfmo@rJ zk27!5&;a;=f{K-u7Hr>cm&h1q2JFUF|`8>c|S#+s*wyRwuDvQw>d+tknT;2_~p7=E)hN~&4x0jFI2zk2=-Zh?P%%le-nf2ARd)^9{4?*7@70L>X*N z)Kzu-)tKF1kNxM?FXjAq33IM-C}-`IhSQcVDY`fcd-OgoW_}O`n;EUYS#CCBv56n^ zXEipl8C{08?eEG!OGSOx-@^jfP~T#-8#RU;8`Qe(RoxF|hpQ{|X{j(D3 zoO!WE->|Hqa5E9ra5=@uPGqR9V-?aA1 z6ep1z=|AM0|EL3vp@pUYcr;+tVni9GIe@>n?#aHE76)GjpN)`5U&e|yzr|iV9*JcH z4p#b6hG9cupSxPmH((d9ew#N}BZq`nobcTe9!A0sMtExop8~=8Cit@i7m?s~5gZwU zZ$R{Ji9Q(7Qy}uQM6Qd-8xVIeVn0jl4T$$FF=thKeNvnwzaVKBb?BbHV1+Vk5o;qJ zMu6+}>pLSB^3bu&l<751D^eg-$gFgr9{cQjCP`}Z29`dYv?kQlR(C|SxlW!&9tNCg zI;uV^1QnV!vuz)AYc%nCZn`Wvg0h!hSKiKD28As`{4Td|g0mOprE>Os0V_~2U20PX zPZtUnNW3m2Nd&n^W6BI@!%7$FHw?htpWQN~-Yt@OC_>VuQXOv4(#14nUj;T(V!ufF zS;X`9F^5;7C=~F&?)Wcnkgu)s9A-1p9PVt$Niobq9^7`oeShI`Wqs`GKle+ z7w;CA2|!iSo#~L31|+4vKwhBm1I}K;R8ETy!7=qJuRIJ({_}}nD*b&LI90ClYt55* zC~8-N?tQ04dLyT{Bt z+gEmty@APWi;cZ!Oi(pl_rJoV`ER!nSFZv53l@JNq@js{$fI$wU9*0~D-R^vw%>5+ZJ4&}bdk~}^reMQ-d>_8Bb9uUm) z(&~Y?|G155WR%0bl1Ug$@-ut6>(23!yrFtUv%gnKKD#l$BWFTt*|CVrHcK{xe^I4F zwjcfslX{Y_+tJ*N^cW=X%jA7X1_@m<$mMdc>~;Z#Q-kXXk!H>U-rquCHN#wOWNDS?#T~{_%sdW@ zFhWQBA9lkbmK6aV&pxA*!)dgs^^Tx$s!>_$V3a^%{UafylLm2>ZU_#3JPk+pl-n1F z&7=R&El!Cmj(|sehbt1h03~wG4f1ELXw^D zkB%!6fS|@*d0WyYvPXr#Z~%0mh8|N_SRcHCdHwD!FU_=|4Gr<#s}1$=g~W52GJJYa zwY;!oX1_AMs& z69#iq6r34uD$u~&4?$InYLLaZpiMe#g*cvgiHh8y2eyUYCp-x(M{%z>?z_c3jJO{R z_txM(1)TGZ^Jj4`BF^i=IWjok0Po%6eK5SIfahoNTo;}p}kk5HfyGTqMfpzl%sZK0gvb#pUz|fu%eyzqxOdw#2uuxpq&~6 ze=15}{ZZe6My(vPv=}acO)ePzeMWc<3oZ!QT9N!<#>J1dnMvN7Dj`1GIg(EymT<}7 zKhF2k+SrrF4by1Bm_mqmpdnP7RJCBVM?{0pUfm~gWUzCRJ;h)AN&OvZ=K7fOFK8+Q zNd0HMkN$SIm}S(DfUYz7I9_|VhTa~m<*$8k46>;8w@y}}1?vshi~qZWu`(>2<0I{7 zUnD!QD3kUE0erlNq}MH=f4NegKXm25$mq$cxM!pAq8=)6MX3xur1FaYhUX0G^=|cW zWGgAxa%|+xR3Ig!Ql?)La-HN`7u+ZRN?(bFGS0LTjNiciyS(z3zE#7`oxS(3CDfsg zNpwxTkt*0f4m$4&iaGSkqS`?y_bKedbB^rU>w?%HI;o0uQvR6w`L6L?6NI$(mp6wk 
z=plFIAB?5aSYVFZ-=~(m-a_L8?IEp}5ay;=szSVd9^1M9a<5~(1v*(Deak5U!n)kE z>l$NBFd0uPZ%?uoDDq-xyox4*eZ1jcb)!oS8@PVTE9YMg6z5XmsK_acRmq(1F$y~e z*(LdT_kI%r96!8T8!xuN{OpR??ROL5u)IC@{_0;SarIrjG`|bf$foHh^!^;w>{oj> z4NF7wxcge~n)RSZL2GlO3=NjLwN>Zb>IDT^C?*xpYC_RhME^u}Y@z=alQKqS6RtMKb=Lwcg^RLRL@kyL#73BBmFP)G1TbV3X19V-GkV1ASK6 z=|W#MB*59YOCF}e&eW=#7(FQk#)E@qvLyeSQ{Oj&ayKP*`g$8AqV^a(O$a$)Xi`HM zN)G{w=Kl(`CULH1VoxwaGSEk>M7d$xV_+s%jmA*E4=ufYA}BWZ zIqasnxsDi<%JP%h46S}vo;1rtcH!AY~p%oALZO;+Ez??PA!8Xoh%nBBsAW)Qc1n zO?I9qSawV$iP~uZ)>M-RikhcDsjk1y8s}T|99e_S_kea-yoloW=!vZV^NQoXTinBl z`@wK;4enFGIo~*c7Uv@3ye^z0gYymW-Ywn-!+Q#NeiqMl;dul64uE zmVe_#!toXOT#=*YwHLjpiF#lih}VVwl6TYBEB6t+ zQC0Hdnqv?ser53}_~8=FK78%A`E3axBEfbiTyU7^M9$!#LCPCAmAy9AB;COtc>H#0 zChcdJm5+Y^p`Ztr7sA_qv2%hBo48_s(wrq@N>TMmzY5nCxO>o&eh;RmG{q!{8B80v zAA7bT42F4!&a6BpgNoydZOw8@Va>ilLK#~~{4TH!Sog8jV`rKr}y@#UMOzN^` zmX$8TZT1&^t91vwpgh%P$fSoIWCy(!HP%6T?9|_Ae#r)BxZ*GS$LL{Y*;1I_O$}sl zHgjK-FB^<5H>BRUrH#!D+9s#~Aw()?IYB!-6fD)I&+je@VRyZK^VD^?AWoU)<~wuh zV3wQlSs3Za!1tyj^HAa#+O(iPSk?;g@G(pIehjGcUtm^epI@nU#HBL&B6e4e zq1jXpo6g9nt#B^^C$3aI`6i)@(C(`DMIMpDqPU|a!jOj`_CTeZ!%Pc#Pf-%wwZep% z&)AeFNv8t#H(!sx3ll@;iJF#Fx>A7bgnvZ|1_8 zXy|-iGK3sXG55>SvSa++Po|&ut)nk<{%At*1Uz@g|HSJD#{it^L$}v*3Todw+0OY= z4K%9LS}IWX!mL8h$1lCiM%`kzUR?}$3k&++r%x1aK*=J`cKe@#fb3)grT7(T$VL8$ zWd^-AkblW^vfgqB7HS9|WzXq?Rhi;v>oj}eT84%fn-9xS;oXA}`*m(8wU9E&G)Dv6 zJs9TUpA-W_!AaZ(!Q7BrlJQ=cVL#fX_2!dt^Gj5QGwSZwBe&2I|Bvbk9O)?T6~}$I zxQ7w>u0V@z!<{ad(YeM^#YL-8Ey@dxA z&{GBb7~I|Qid0c=McLLV&0|Qu+rZ*;+v6k;qm5x{?s<|QtZvHM*^T6_VUoQ6m%e=h zHVq{_^r4gif!P~22z?h^F2USL;vynh54|=TFJAx+mImKQ92vN@vxk?&H$Wz+d3U9m778tfJg&!(L9ZjL4Jb*uu8H$3oqAyo=qScC=|sANMQ~pa zPUF`E+{jtY5O-1Nu5aY&|K795FB^r?ljf{>*5Pqgr3E;n^A3B{ug}EM*|$*xTI&SP zV;nS>JJ`W-h5n#Z_eGP358&6ikYqz?0~mXX5*aqbAk{e0YrAtdp+@t2S3k9+L#Ac1pZz}E1X#IB zN#{d#WMcB&QSE#20Htber>MID#O7GWJrAiPp42fBCs*RYZ4GO4gX*i`UY!iLT08;K z>G+&$LX!xdhL-1#(Od>`QKE|)bTUXlfK>mUgg+=z-uvPl_7;vdyXUl%Py}ZWe5oqm zS|sMq?w+{hxJa<(?)$SID*zrA_;kLTpM>9@V~V>a*$SuRdpp~nmIZ!c8ai?e3gDUI 
z*JU{gabQ(1bbpELC#)LdR5Aa$ourYR$`$Cf0e*(#LrRJ~(A?JT3I$z#$Z=2a^(CWv zcz0W+xX!o)Wn|;^{1*5Ut`nC?KAg%0d_OI|y|(#_SU6!s{zRVUw>UumUV6rps}hiv zM!db6$}KoujwMS`H3A*ptYV%!04k}ku3V-H)9irf|eJgu{3tKHuuL$nH4lQyuTqxj-g<^Lz z7x-d1F;H_|aw^{psu@sQ+$1Z8y;bI)S{ziP9XXUtKUA2(LWwXZ*O^sfdQaK%Vlo?O z=lINNa~FgA3k-h4tP^Mh&q;@5i_?IozR5t5M;5H;crOXto&ep6&vMTnZ-JdtPW2Go zX3^z8pDWKC5(O)_VzLRJ^r3<56@M#A1F#&tqtvF9gAzZ47)o6Zf!ohc8uSd@g*}-2 zAE+*{LUVtXZZ)WF5E&-Zv^Zu2A?6GQ2VuS`!b46jaf_HNRKd0ERrsq)c*Sl%*hi-q ze)x7Pb#Pf3LRTo=ZMWq?O}^r2W%zMGoRP{tqAUinz7Q0VTXlm)&m=x%EV_!m6nr&* zNXi|R{2?dxhP4_^Cer3He5D2*-NUS#A=AV?ZQ)lrqnm_iAO9wGr7hx8UyZ`h(*+{# z6~}$IxQ7wk=|MkwZU~xl zOaITOp!l&RicVP)Djl2+i!>604$qI;{l`TNacmpD(|)^ib(4H1y5i8SyYvn>OPE33$rI+OYXV?!)Yn|$tPs$Or+RpH!vlz&y~(^lt_F6d z4Zh5vmx1ce@z7c)TaZq>{ZgJh21*5M)NwD8piDctxy~p{Eu-et@;Eyr|?@>}Oh)MJ~hm71QjLgG{rYAQB9GXqm+@*2`dYv@s$GJ{o>(;D1 zoQhX~ORY(6_4Iil_gURH^MMd1_1-%nG0y@d2?_t@(P%`sJa)5`n2OP+FTreS{Nli7 zgH=6TlOOb4{;(MKiyq=-*Wa{#gqh9)|l+9*y`6A=0}_wKgrp^s6**sbsX0Y26_*FB}V2?BoFqml&Ib zw_?$Rb>`wL6~1tyl9Pscgd0lsAS$S>xd2t&-c3|2r3b%)+=MtRc8Tas&~@VM2Jv?9 zo~~f-GV!iXoeJgHCh@S@l;_EXc*ve0ipdn&LSDaPrqAC^f?NX=LhWX+LO<#br?6))v5s+I&fycE z5sCM&U)(6-%WVx^o6L~N&5i|UlqNeigGgL5yjr-?!b!j z`qW&t7SQe{2gPI;1WPwZKKV9W0dH^~r<-P4AvEQ4oBvjsBYc^wT&7~>hSE!({yv=+ z0xMTvf9hEC1wN=VaT2+98Ym@*!oLPjL$nN3-7-q^gcA?+Ew=ZgVZS3Q5+er_@YtFb zLG_6SXk2#H3>0Mtg4T|P@9PMVGLy5x`G?7`PiR5nnNUc->R8MNqrFSz2%1N678YbEckl zfpD*Q>HmDUxQ7wqYDmxP=AJ#}`7*5g2w z=Y#UXrR#iv{a)%VuJl9D&~=;N?He2r*Eu<0BEt_!DyJV9g#|&!JTsu1r_Vy>bN^ni zNQXh9hmLW}%XvUO!K{}9ClY~EtF5P^_#L3>(jTv3lm=Lq!_M1o#zE2%EsarIWY|Mn zy>4+rEJUIByXlW~9HeF<6=Rl5hWS2t_tX4XEOeq@YW+e`H01T1g+88z4D(yQDyR1_ z26FJsjZYx=hYm-H+YKKqqvjv#Uz)~xLe*I3#mJ{OQ0wToukTp9z#vR{JByzK!uCd# zrVVMK@$4F!N~a@Gxt2u}Gh-RLM=MY>zjg;!OY$V6Us8p%znW!RLj@t| z9(mhcw{rA4tAGV3DR;Si__9&Xo*{J2>aJ^Ny(To=-^))YC(n&*5~e-YXkzjrMuz6OJ@Oj!eYQD#TEEh zC9ko<<^^!R%P;05^-Yw{jQvftfHSPAOVw&EHAq7nw>KD~9dM z2Yuq~k3$9SDH%f-PJ^V)f{K$eG*IV)f9ajR2-tCxrr{6i|JU@BcasK|KJc=)p=)T_ 
z8c|o?;^RuoCV?#>w!mX#hBy~cJDnEtooFl;S6y711b=vB*KMMA0}U*4WX+%ndW<2>={!*nR-a~b=+GfY zA6_jLwMAWE9y90X`b#?m>7h1I!LdZ-+~pMIy5B>boO+?e$NZ9LAO3`4tE!CJI@SXym zpT%=sc-{cNgW>yGd~bliXYo1ff7xfVCD#OI)h&yZOpYh2QS@i^rci>qtxSjH;%&ld z)s4aE)I~zoX*U1R_|qf~O?s!x!xPZ;8<1`0(F@>RnZs&usS$81JJHJekqs)5es%Oe zKbX9HM{1z@5fGCJ{mI@WgA8M@oBqf7E^TPy(Rs`UM9Doa>U5k2))Tb zP8xXW=4q4o2Jn$!LMEwqi_Tp2YvnuHPJ9p?!@?Z-m-v;bRgp$;fe>-8Y@C$qf?1!I z3X$>#XvE4Vj-Bvb0$EP@sAo_B!Po!AL}S=t5DwCx{`a0G1ThQ!H)pxiSbplP^(`d_Z`8I0aL40&A zbQcW?2I16vvuH-{>h?Q_H}F-$P0sh4+prE~Br#gTifDXn?NwM1!S1p|pA>r<0~!F| z$MnT@*dyK7=ajVw=GEg6aw{MP=yUHk1m4?#4MKAk$=(WJ5f0~mnw7J4+TCX|}rlCstD8D_4sRVvVLMT6MdMEeRpp@I_M;X=26#DOb$AxzvG zggvpIdjYp8po~U`qe`~Ps9*t$-pS=qbRg#LnX$5maE;L$Hhr&3w5fmW2BN?LjGf9X zdMela2%}5rt--IC{vOMRc;=Sh$(AjlvIIdz-l1ibBPD6vJueT+ zRn{_4xwwqpop#^Wr2L15>iefIU(SVOOxC5HBmba(&S-J&a({%`2q}`!Zze%bQ%B#t zV*G**NxY^rq2d9OAtzj{qz4FjdwF$a&qLwI{znEA_wU0Kwd$nyawJ?H>c4UAtp=d} zP9`XFj~!@czKZbAXn<$UsCl;gT47o9KmGh074SsxvhMNGGk|PmE1&WiHF%*i@Y#^2 z4USv>_`@x^3oe>GLN>rw3xD)zY5ADP4*mI~LMJ>)4ygNL`Mh0@LI|fVh4tqov_O|( zHN!mtHMO$r*DN~^`}!1h#*~Dhr_bfzqdzYS$yrC<6rnx}L_#79{W*_A5-c~?sSnqo z2l2PN9)5d+PIz=3maWZ%4=p@z(Zz5(95#rt4* zPXW)*;<+w7Z-C#y@ck^lH^ASs_?+E+S+yXzm<2zs=FVpjzYKSU187nFHmDJT#(r z^u;#G4@Mb&dM%UWt&t-z+3LSEg-kQ$p8m)A9_UWz_>VuUCtkJI^u!sOdClR!N8)vz zY=7?jz=s!dd+of%`Ia7t=B3ewW(2^#+23|#$+FSjDqCmr_E7j_1(9=D?F^KrUwkiT zh?HAXyCOu&8^B)aVxLKOFm&Vnw*u0B7R|_is&KoG6Ox$l6x_VU3ha*ZdziL$5|1ca z9qQf+gda%EUG3RQfCYkIyy$d`L!Ce5TD^Q5jOvbuZJ0gf!sa&5--X>7M8&`T8dSEc6>I@NoXQntHHC z7#^*UCkr_aR1>-5HCkVxDuO=DO{y_)f85c1^4uJ>u$skKtRM?bs@%Q&mrow-Th4d> z%oB!;iz7zZZFPWa8^4s&r(yKoivdS}mse=leKRV<(FT;?G^$Jnc)=xsTTJwUToA2> z@t)iRGDxGx^y$}t1!5Gs@V2X%7pO%K`cWQzfwqS9Nks48KHo6GVGJvs9>1Q9=k(S}%1R*2pM+9sJs9S-8iB#Xbz;UY4Ok$j=lv%LU} zs?+pqcZwq_y3)VZdrAIWZS}(D)I8AUUM5-SDTc65tjt^^aip^Kj*dPkj|J;#Cf%x3 z^ho<>nU$2JBT#8OuOOwA5Tx|g#mGng4JvQ+=Y%||Z~Uv^nt=jcB}{A6(C(6_4b5KB zH4F+gfC!ys>r|n$sQw>{o3Y(*(31T+fv$l!aCS;kaPAcy$je(o=H9Y7l>XeaJ?`=n 
zs?YH}>~hN+v?~5#Qdnghd{~6%9>q5nAilMDII@8sdTsUh7I91v=uO*<#VI|9l^$QO zusc(L%I17jJgi#;|FW#-wEcVojs2n3sc}I8=-&w5OFqU487bTEIx^@2-jCrIm+8L1 z8#;D=m%^ISk&c7k|3Y8GG}kC{1r#2kwp}BSUO%UT%mfs)oai_})LkgZJI@SXympT%=sc-{cNgW>yGd~bliXYo1Pn$Pc%?f4YdzI*S-E&FuT!X@|t z$9fKG*r6VApJ@k{Iq&}DSTh|`!LRsXxbqP5_r;WvINd7Dyxr>;P4bFU9(x{c_Wcxa z^oVJuC3zSxACfs=Toq@*;_h6;(Y7Bzh;*&>IC-QN38tE zMYJ8AW;c;J3|b8pf*+M^6I>n#-zM=5&|yhCLAU-ZFuxMp8&V$()@d?)rDF_%%)j@J zIQ5D_x;dXBNVzW5aBHadf$C$J*rR7ax`V-X3TNN_HYNkDsczy`*j zbccuZKXL#W)m)!N7Ag>7&G99vVTTZX@5SB^)$3>;i;W5Y^L*HYXNE{T5dxRSfBb$d z+ZVmMcijH@%~)7!{BlH_+XNaT+0g~o17Ksf&sDNyLugE4(#5a&0Bz3dUl#B!LN!do zq~42vg$2_%`o^I;bmdqfFeRr(y5C;<>H5?P2)e}{;bv}!W&dnnH~d0{Xz67q-Ap6> z+=XPV z(@P`1s6jF7MZM!kGSEF!7GFxeJ~TostX9HSy{dO_ zh!rxu{OY8Ntv&1#H+nPT)FU)Dy}wN@FBpwKO_(!vIRgvT8zrqKJD?9w9%mFgU_%&I zOQL0xMG?kR^S?i43L_Tti~h1N;t#?`QG70sfxF=d723myn#!6S(*w(PXGE z8$CN4!#CTRRWa zLpO5-&yn)8sP?Nnf-{oU@CL(e{yPqZu*b@e@Za2X#H3?UuPL%AAaOqZ%*%^Eh{M!o z@Be$x#%65ieId=+5NT-^;k-{Ib7@9%Dgr zibspCRqw-mnh1;Y8ZXuy=XQ;KM-vHBP z9>g?Fj((lQ4+|TdE@|BG1UreAQOvX4$b)n2Qjbn@U~-|Etj9M!!1>uPZ)Z$65c52d zbI;DLqbinF?gb^zAnRANmsIgGTyJu3$(+alSS%L$z6&oAoFB9WO&2%A3N2QRU;SRf zi!IE@|0n`+ zzQYN4E2zcTI|U)ENjMr=n1!$0D2uSF?13c=p$o^XhtShEYc152`iaCCzw11=sUXJq z9j0F@JWzqlDtp^c z^BFh<#c!JNFRju*GlL8bFZ))B{d269SDfy`7K-6M7*!xDYpgK({3I)^|3H00Tq_oS z{)Ssq)F2-1lN(!mntm6xcZjgQtoI0AS{LBevT8?5-WrF8XYKox znRnrVYJrQ7rOA96n)8R-`_;ZwzZ0A$bzq z=)Rotj&(L^&dPNhx(%(fo-XC}-A-JVs7PFkgcCIc^e z8ZxaA*_)miOmAkwkIF5bNaiVcThZlZT$v{9@X^UDY|RJuxghHB@sKwv{~}GEw~ZAu zvi$WsmG&4GEFH$P&ccP=@{G0bUY$a94=vhHuY7?g)I~E1T2lnvD~|haaStQz2gAKJ zxK9D+eB=CCoQsI_x^RvR&Nsk&w|E~6?+B;BXXbx$3gAo*^O7=03WZz2yByciUbA^E{#Z}}%Gle{$?R~>%y^5{avo1eD+ z<9vT~5Vy;v*e9}v{p|VcHA?V?82&nm6~Wh^GtHlM&4OQUD2+RRt3(qY-cw6Z6NFYf z$i^>Sogv=w4e=rM6yTFBTE|nnqtIwVRNY=L5w(E62bX*pB<-c)Q^(l;5j?fxzEW3>?N z_lCA{V_)p32X~fHmJ4E;Se5sl_7#cC@Sx~qzYnI;I=G6(+ans+KG)>`d;tZ?u#trp zdtv1ilTJ(NPDskJ>a)*RpF-`As3ernoX3=gYurxAMS+^otf+DOVBn|AbFQ)Z5TpgQj5$4_Luy6h&HY9Kpw&wy29_6np{C7@H*Ifau&0KOuZ)7uVEM8arCSDt 
zk>-vzPKQTPV2QpzY5w$eD4Uk{Ie8d_z4>i8@RL^)^P;khsK{VN+=ba&gh{;KVtq!z zC1EdM$aCexw>c+Zs!SW582S;l`RMsk&~*nL5;(FgdTj-5I#W%Zt+fLG35=txa*P5G z0vZeY-JKyuNwFDE13#$Hb@GDwV*zacr@&`Jzmr%3-_owy)L%I819JaQ*iUrVV5N6o zF%s+@Sf6jXZwEvxe+7U{BdFwdXx(U@C*c0=(<0>i9%Uj|J^Wpt5gUIVQlPOifW|zz z7x6Fm9cq#95~n5H04x1$`C|Pc2s6yj*8AzIi%q2{QU;WTV&Q^27U$ADF^!VPGHV}i zA}Kpv4wXR}NZM7EN81$M82BEO?|-KdqmcL|xrsgi(`HTdvP==k13S2}nzs-eHlVC5 zpDhI2+2M0+>fy*QCdd2kKki_gSwbhDlof)H#TTC*Z4E$YS07NlQL2Xf-S4%Z2rVj}a4t8H}}SW%tx$e$KVq-<4nb@Dao?;Cd?~hB z{TuqiEwZ#kTMB!xC#^UYB8rTER`oi7<*}Nlyw+&b6)=6FxU4!W5O|31wig*(1>pyq zCdJv_fZQ|&|UJhbwCl5qSy9A7Q(`B#=anpdxh6OWbxixVIa)Zi8ZzkLyUPE!OIPSYu z`Jaao_k-cy8r-LVbG~u@EY3y5d0jY12Im{#y<5BwhW8Zk{4Ad9!t)0B9Sq;k;(G)9 zJ&Vs-{p%AhYVQ*f zy}avOSFeIKiWJ)`Rk|ZT3%63ARtAFmR?hnmq@MTA1(KveUV2k`a! z33kgBp`Hq@J3V_GSk%K^_utWqm~h^cP&(1m2&;mG9{PYE$~!J?C2s_SoJ&WNUT`6RO4tHGG- z;ewnyMMM5;#te}h-L$3F4#LU`^#<8w;;@i8!YC_QBBqE)C`4Y0Mk4OcSBq`8L+i5P z4-EywvCm?U<|816LXkC==e_q1qAaxhr>_u< z33I7hYD|V8efAFZ0=q3x#&hllIxlOI&whB7!sZ6xkxw$eC!Y$E`kuNI6L~Q53)?NJ zmdCMQzGp)Q{REMnla-$qAKirBysG1;?T?1+-e3CO#iND!Txy;VI%1B|pI0+~q^N>S zsyjWESxf+*=eg7qE=EH!sh^7?OieJ(I8Q3kSToEbVL~EzK@gz|QR`u5NrX&gvQDO_ z#emH~MUKNg0bqEe_Di|lG&+6!XYN)V6K2>E^jVcajcsz*H3ht*L->ieznET60PIvx zujC&1K!vD(krKNvq`l>Co0hJCt*862h4*S=|Bl|QgEOfR2ggTlJFOe&s)YunO?Dix zop*A0z8nU?Z~u_E1_ux)?H);$7Y)FBF0(%_|4@bxVxO`?j$#*2424y<%%TUUmFrLL z_MwGYBR54uI^l5R*rGMyjm<3Nd)FQ^z#fd^8n3ML2mTTsoSUur6^;0wjNs9fj zGqjnGw63Oa@ONWw1^RoO6_65R)*ogY>R8FtQ$rV0?!4t_=r5keSP(&4-Y}FWfFY9*sx5FG zFl8Hj)6u8J9{jEK?nLiQm}?67vKZty<4X zcTPkwzpYV0=`~E~zjv9m7s9${zg3XFuOQ^{l{#tK&d72X0~1HTA`&J^X`p$+9a&T@ z0qHUp$OykvZdPwF79s!4kk>T<8_SdYW%ex@V;!&6pfO5Bs@5$01f(g{>+^mMUKb3& zG#*na1+3pd&Wzo-k#dvoGV+<@b5J&42&b#tu zke6c<7FKy5bbJt$_{(656@;)0)NG%_2ydyT8x2K}Rj$ zjemKz?23=$(=6 zpn}`I@G-%f(?dS)h<;kzsJ*Z}A~O6FDd+M;&fmV-H>h9f>N90dbWezT6U{bUrhuGj1V0Y%-(=f~r zQ=;K?S~LxW8tt+<$VqvX0-8Xkqi!mguSs?wm5&s*>x|5PveH5B(iI5HK;_Vw%Xn~8 zehyfoy&@=>WsChzQFIaP)W+Bwpc6#j%+Z+IAmbFxb^Yzvfu~_-uANmXR7!pBTbpkT#L)~f&=+xG89tv+(&bN} 
z#go=LRizS$&d6=&ox+ot{MP*KLETXBm?N?$!#)UV8ILmyTa5)7^bwavhq(~4&;^O9 z1r_X#=oNo2I#DEuQEjf5TM2ulJ$vi)SPVGk5iqRm5epv3j99v9gaN0nlZrPqlK>id zl}+O#9d>p;>z#%NH3^|vbKt!+3A5bHxu`Y%552Y2;wYR&h6Dw_i)E-v?hr6=L_c6zt1E6o|D%|_C_Mf1j^V1j}v`)*C(!VVCbPxhv+WBTTRP6|>!+0+|gtSF^<%5ZOhF zXC}uOv1eRTlj=uwFr${ICyXUD5q*AN=we7Dn0}Lc^!YJA=+R7MQi78eFz(PxA>~Ps zBvA{^jiNPpV8M5ow13A=yEVREW8p%ck6#{jH#mjG%rS6`xlv4^>0Z*l3@%|K1WL$R|l!I-3> zuRm*xC-(3vB5bTT15MMObGcd{jAhKQ)UH~(V&?0!55MnELs<`W8r}*AV^>aF&qSwM zVtnsQX&QeFLQ=){ZF?RUvAFw>P12YApe94j?}u1)KtOD^VA^wX%#@$%4NdMQsxiGD zT`aze9uKy=QB0c%H5A?!8mSI}S{T24eiA5)$(pC^lZEnPH;*5%aP+fbDSYJ3Mpo%i z&f6|}>%I`^M7c?xN)G{3xtY^|emjoM({N>+mEA)F1=5xRroEy437Y%&R*{I+|GD)DMU4g^Y*j^rNLeO+V4C%%ThK6Ng8+U7+grXVP3gcHl63 zU74G4EF@+jHce0x!G?ON*%OR*&`MwS7b!Gk*bgxWuT;?y)W`O^#2rmb=##r{j+{;Ta(c8KmrQ#!^1InTU8`3IWLZzy9ZSKpii z$zp_7L>_fWxul6%bfw=8`_73Kzs|0D_Cy!^`#jn}-9j0gI=6k(W7h^zrMh}YbkGS| zeV-`SSL}#H<JF$wXHyOBE85|CIm^iyZ?8xT@xD_u|Jn<*9x9D@V!X3 z4MCD3jRXE{sAAX}m*2}|7qRPLZ%0I6X-v@1)5S3kmi~t!tdIf z@U>A31aM}gtVo>$Q+E3$OBb0jX75EyioGqA;L6pYF1ZeGmN3+HU%3h$_ZVr5{+JA0 z9b_dtNSq`iv7v7FAM6;ANSme!WJfqfj@HHgxB}I&kon%@$OI*dcMf!o5X>-BzgFiH z>CP}O*fP$;jnpORv#kp9LOH&zr(QXjLB6$=oh{ktA-(gwi)(*2;m&Z|x&fLg^q+iW znPTfA3=0hG3Sna?tJC%j(|s54H<`XA_nZ?naGRBly)76zL?swfz$b*5oR;i8JWP$T zD#hpPand73t@QQV;@haCdbn=2EE~i=gBTenIRHUN&pgHuZ}4O5x@P?ob8v|wP~>9! zCQ6n1-mb}N0{&y(LN_gT2j;C2;|lzLJe`L-SO5S2k(oUsva`w-@_KfToxOKtBuXWl zO0uI6qO1y$hGa&0b`G+Vl$oSZWS5mK-`D$h{m$nv=yH|kb-F+9_uFl^1Ix__+X?>A zh;VSp@$ryz14Xu?v{mx=tfrOZ9$E4`8=W8Bt|)mFz4u1tx%JC&Oxo+OJ89=CNNiB( zD~e~qg(rPq-B}&PIDSgUtB>Y`Kr)hIp-}#RUU9;AOL!OwKN#VyA$$r1=bPZq5?n-r z*F|t-2)+TayCwEu#7=?e&l0^ZqHjRl!H9g8$PI|^S>m0IkDt9iQmuuOQmnNP4fCU# zBXL)a13e(@+=)n?`H38#b$N%Nis%haN_7*eAZVC)hx}e~qOaMG7rZ)7?#p;Kz5iL0 z4r8d72mF8xx&Soqt;q62-_3okrtu0;YDuj)GHL@eIP1Ofg?#5o(p44=^`$^BVh_B! 
zNbXP}zI{pirzs5b*8YKQ(Q=*@{k&=_xn(+m zB&k$s(ck1k4-U%pdL5!h^MWMz^7tLZvotgse@m+1SXi9&fQ|}&_^RqWvoZtfFp?jC zU^EVJoPPgqpT0Vt2ktxSlPJ*$zSO*UiR(bmC3um&Kn;IYQ~P{{3ZEQn?lv!0uoxAmjKqjKet zBZK{j-$sG+L4G~>U@!F9A@eLXubivC81xT2{&M2GA-Vr+T-bVS(rpN7{5xv$)?5>~ z=}fK*b-f{Z9ojejTTc+0a8)tHMaE-JHeut}Ue;jc0kX2aBbC@E9&3*A;(SEXN+7~v zuQ3#y)4h1SLIT)N`gXP(9|zu74VxWZN3hPqa+}+^OITmj7jmqeho$CSP0y&jg}Dkh z=bUOXf%LpSKWNyPLEpjM#Z^lHl+FxGhtzD;kM~9@AJa)k_M9x`R2J;SKB(WlYrlFE z8#w$(H&5;sCi6b`TjP@uBztpi_|#r~TqGhk96*`fx z8!_k0N~^!0+`!uNX~aIYk0VD}Soa*QoW|yuoj7XjSiym*0ec4uPN=w4busBI3Yo_J zzI1ZF!KP!&a|*P7VY{ONw_*nukOO|N4y>JB#dy`9H`<*dL2TS{)_dv*%*v5mKE|a3 zMH+Dx(+NH(98&!?%&UgX_iXzi;Mso zBzRo}M~2`V5W8Dq4@T@1i2f|m>mvFF#2t*tXNlZ^_?{)++1cIYTm4i#XoZZ)1>-Za zAjMN7RE+W;w!8T5!JN(^bgsp^YWvTAptK$48o0%QFIN?dJ6#t>M?=zao?!7XQ~V2ll_Igk+v32Ik=2DD`s->d2Jyhy1#erC-#HS zM!blv4mI>ru9mo}Gl>ZnsZA#fEMwog%1W|yH<4dg`y`XQe%9ONE02$?(!$6bnV$VB z!Z3RGu1U+KF64ZaTEU1YjvafrNBh4%iO{iMC;q#0Cf*jgEBML@;7W1BOKL}ftc#0X ztC>3}4?gDK7H`aq>hAk`LwZiW$I0erFx*m(I`h{xDeYGxr51C`Pb0YdxO0y)C#0xH~ zKbC@WV?4!orr)s_h@ps7(8s;rXF!a2t3`+1JrG+B>zCgA6NEC zfr=A}6drvd@Q&7%zGNRe$i-6`C$VA%nU=@&$3ktO#SEL6uZkP|lD!jOC3*?4UFb=Q zdKm(=?(%*{$8*8sTHN}SnkLxW_SM8Pl8geGi@r9S8~{`$@VVdsGFRKEf46qs0i8&q ztk4^~1oAy15V8Rbs-|1+9(ZJrVs5GikIp3m@s;`ejN&11P~`w`)R*IAksO(5<{kuZ)?uyR zSi*3AUf9%-A@E9u8_3_Pbma&k@8vXk>Q0%nLCY93DT~{*FhEY0;jqaSAcwMNZ|GkK zx6SIhe)MGnUzM{to^J!;_xQuptxy>5pBifVda(mLJH8jHF&%|<4(|&NxVyp#RlWV0 z@5izG261E5MeZO&s36j6%?1up?C!98?qY;jobcTe9!A0sMtExop8~=8Cit@i7m?s~ z5gZwUZ$RvBi9HyxQy}`YM6ZkJ8xVIeBA+F41LAv@cxQv}d{7fJUq(!x$WG9&h{3WN z3b%(%+K@CqEYn`ahK8jNoz{1=2W9gTLl2(E!RzId==viObUN0vqs7e?*fTb*3-*s7 z>r}VTgS#D==NrFeCP`(et#3d|Oi%)G?hixPd=$W*KhphIj#HvH!sQu*Ima=jk{d{# zy*`N7va)o3qXBHQ>!Qj=tRP1uN*i~104aTnUbnsc0sH$_=%r9h4tDz+;K*C#gw#t0 zul}saLAYMOBDQ!G{?;q0TLzlY;-KS6PmVUE!BYt-xp@bpxb8a8dh8XVxsKTVCifas z>Gbp3t2dE`XFt2<3)iv9uz%F;D~D0xLx)rs6j<=$*)|L7hC9FtFT6(Es)0Y&H_~+6 zCVK+6Zk`m7z5`y?jn#)`lW<|*p~$oHd-10&C$wIfW`dK?WY=4=kK*S}vD=y?2;**7 
zVr=FE>0RMQUy!Rh70HsmQ=*8=QNTqAfU9al*U_T-ay4f|05ekgHfrs7( z3la)iS|)S(9k$h;KJ(}yo%^Ti<1O_MEC05BNBH8f>+-pUe)R_C5;}{@#hO|78-4ARGM`_lZ7wdaS z;s>oM-eK!w6Pa(nzCi@#d*Os&3FbU-@u9d_7Dg9Kaoc)M8PHn)KELmf7#!zYycx&8 zA6$~I-TU&_99F3oBT<|)gFGDMppqLfArUQ)-3eb_)QC6MS3TOUSwB&zO{W&}`?{5@W4IcCP{SBgPAjtQxh`X$P(8z%fWs12-0+)y`pg%`x5AG+$w_rD!+nb$$aXN0@`|u zIb6pETDzmu1HK}QTtm zJj$P#4i&z7gyxJp#sRPFKL4B>%n7-|0)`YRjzO9O?PDwd`jAiVhOs;)_c2o^gQfAN z4(!E=zdi25Uy!aKJNI!{MR?SQU*}5<*(aCnY(dMY3{HQtrN}8-Ml{_9mz%b}A#4xa zMW;32U_CLA{4ea(*{Me0XdY*{-GAS+btbZV1r%0Do9c`KuI6vCEFKSkpN7vEn*#_nxMm7| z&wW)ed^Ltqc<{{b(`d&$th4cuclrfxL?q{Ql=xse333T6|%}o2ir; zw6oUaCUef#OB8l~{e28aeorZge^L5{xNAwr9-aP)ZB{0kKkH(Fn!Zx){HlNIb11KA z(!cD%MEgHwpQ&Cz&R5NR*E~wj^>n(1*Cfa47sfnpgva&Pck0Z&G)kE#weUp!^}Y%* zD^PXEof^YVj416GY_uWc+d&ai#&uYj@#jO~zN+Zye}X;1#vG_zc6skF5l!?@d0UG; zn>>0H>rK|XZ;5jYWp7(PcE^4DgT%9)opCM%o-Ne0M%~zT=w#$y0rxvYocdl`UmUta> z`jSw~xs9iw*Vy~?b76H{Ju^G|Hj@>UYu-IGFX{sKFF6=5Xe?vA2t!Zq`EAVPSNzOX zx)tQ`U`Xoc7*BAy(|SS1?ieWdRLbT)BZOa%7fsNy5W(F>kL;s5w-;UUwD+lBItnNb ze7yJ&p#kA6+o?C_CGpe^mY$MzAzc5}c+fqoO+>c;rEy@<2{7+|JUbONg5v8hopp!R zV5{lXsW)o<*rS#&jwwOCnBPD9rUQAySa6ZutuwJ-k$-jpuXY41$$w8N?!NNdq`D(( znO2PYU@9`iM(R8Zo@Nrrz9qqoe^Mj){LXlWJ(crKo_bz^_0`hsd$3>zvqVZ+_|F)@ za$%pn(Ny$++FQ(tz5F;V>^WOnhYevRw-}ob&aO>m z$U++Q0zx_64x?!v;|hNx1aTVougmsDWG;|lHuXaGAv`@wLh=Bg8(I+8`aR0U6ZM$y z7L?Dm#FG%a!@mSCnOC3a7$lFw!&g#v#&@%;_un8)0+X49L*1$n426fw@1`p15zQ;F`~^_y@9~lVycrrH*j$od@657nErJuTHF^BfQ2jq!8i2ml7JIX99Svu{%Sx)I9K?s}7AyyBjNo|W z!6Su3Dj@c+dsa;U4yK?ZRZ#tu34e5C-}vGk4m9;}Aadd2Dfr>8l9Y3t7VuMNwm7N3 zi+%5NY?OV$j?V&78Q*puR9}Pg@Z(%%C?4fb+FK$4*2H2mzR@0qhTUqNa#yC2M?UFm z{P#X!Et5MHj_OMoHh(KEntB1bId0E;S}G6fZN4`9qdm1DHTp@ zyk`%>D}db(d-?%+4Dcoj-g-7t0Scd(?tNI24aeRIoTWC6}cS|gLR*Y7x9I>vJ$@oX9lx6(ZzO`QlsZWqt1 z@R;C3Ydj+nlD5#Mmnk^r%qOh+pYWUPD>Fzu=MAm(6-~I0*>6JZfG*4%-*#2fQUc19 zzpfT^{lUuH)5Zc?c~PV5HoG4S&7oR{1??n@4fG2->8jSN0#jHRN6hvwV$us-f0D7G z6d>d0SDk0Zr;vZ|d*A1M1h57s#?Ssz1*;vYMU|1vV0ByW*d2LY@V0t@{~dZxL7HD#MQ}L4P 
z9y~VR`k*T~2+cK*(h^~Q1opQaofE2xga0}=D@xNm(Wl>ka+prs0kM1gYh{DBu;NoW z=L(&^V(wz5|n z79P4-roJESAFA4aE>Z{F+jq^RTT=+vc6jBEu+ri#Ro^(M%%ULEV*_1}lhH7)U3SnO zy#mtqG6@T$Eg)m0?MEdpU$N8!f8ExW<`HD{+fqgbJ&dyxo{T5^TIm=f`6|9Ez`}b? zTMDvr@B*ImQ;>}kS zginFsd=vaxf{RG-x(JR8!8agwx5OTd*eMYGS)$iP^bLqR7?IBsxdHJ#OT4p|g^g#& zejP^pWs?NoYXyT13N;Ro?+;<0!eZUY81jA~QO%2qH3DSk-JAMzV+%2@`|c}FGl_k0 z^{cNBHG#YP9v-+%Z3SXWoF$V&bU}^x?OKMP2T<mO3v6iyIly z$X5X=t&|6Gg9vz9DsbrGo$L+A#~k?Hw_@3M|hkKkgmb+bF6VvH7$>0h}m2Umhra6cj!nmG^z1 z!y7!Txf-$NT>M>yHuD_@l`~D0( zH{so_PcaC6cHJ9QsE%JO9u*z^A&OH-GN<@_U`5w&*gMQ|siD1fXD@%xbVpx&ZWW&x z_CwLnxl^UquR;2cM?c^8llxRcW?sfAeyHPqi$kg3+Jfvfb z_iZ2>${!~UKIi~l_b1!i>#9&n$7!{!PZ)~l1O_ypFanK=t72K%Bbcs?p>OX_F9uKb z^>@{ixhe;I1-Ei@kcM!v&1{P%%)nUgZayx6md8l_DEZ2Z9)7<6rqBTiJouoR%3`5B z>cz!==>_=?#>7#YA0TxSVvAJA49K~`sLo8L2>E+99DndR_0MT24 z(0hs9{lP=2@q^q0*7#SS*U5of?LO1I`2zt7xuJFkPtVsky$LZN4D7=QuQ=hmB|MCT zAB^zU5IzNh^G)z)2`(bR>moQZ1mA$z-4c5+Vy8g#XNg`H(KjIOU_?GkBd&cz7ax6H$u=47Hyd3(5+JaNac8-Jd@4 z3t{^uGjqS|9cGnA#eK#}2Tf^iyOrdz4=tu{DVLZzj&j4^(lI?yc_ck81T~O zyyKU_zq2)yo18uX%xl+&uX@8zSFyGQxh1T8IWcAU@n5VG*Kdzy%TK-dF>J}qs{7l~7ueI=M(W|d$rul#Sm0ilaAYpL?l&`~7E0aO z_TDM?Hx@>bH_jQNj2`ma>dJ29Mb&xj_!`nx(Dthos-kL=cwKvwskf~Y9yn#gVeljX zx8i+zyLM*|zJ}iwqP!i@r2RjQZ}9}+W2AH6B&_FP%bWcN=(`qdCIo> zQ#P|O;y?*S*^C7m)O=>&fr~bHT|(uXxA`M*VOi|gnZvu7kn4d>d)o70)Z^pseP9M< zjn`b7-0AV{qeiltcAMDLTN);#Fc$Q2KzsBl``du#w8x#Mlq>MeE%gt7sAchU39=XP zf_=E&Z{Mx1$1-Tm4o%ke*SEn@|4bR&`#R*QPhybVl*j3tH!||vejw6AepWgPqUiXc zXMJS_WUljed*!oCXP~(f?UI>x6tZdHtOeo^v8Q)cwA@xJu<_Qn3Q@VsNCcIg2+3&^ zTMJC@y*B9&`Hy|=E;RQ8i(7w0$9gV6`hGZ4_lXu2Ybg9!ZL>}0kxvR+H?1NQa@6?> zk`#DLJIB#JP9r#=C8_r7p$&Y`QapI1KnHq1K}67Ae{7lU3Mx4G;sTI2$S>Ky-D6ZH_5p{s8#5F-qZrnAg8Ljq5Be(X!dVj z$`r*rOYB}-`d0xqqpN|F+rP0o(P5tQ*^AJ){!h9cgDse`SVbb$N3oQL?>XpxcVdKB zobcTe9!A0sMtExop8~=8Cit@i7m?s~5gZwUZ$RvBi9HyxQy}`YM6ZkJ8xVIeBA+F4 z1LAv@cxT1xXAbGPs-W~4cLkNwYoOG-{#&1=h42XX$@!<>17QVg>m$*RZs0v-qsH@w zWlZt2I(Dv>3coZaJ-15s7aD+~3cxnte9ucbX7i+=jMxPWZOHE@D 
z%DJV}ZR1E-?K@GLzF4SD565iIX~Fbpjoa6(Zon*c7hd0t5Qw=sPKLb80p=RH)K`-W zKzd^c{&ns)Fmd0WRj$4Q%lB~buU~V=tEsGn%zQJUaItHGb+H{BcH}#`y5xqx;Pf|Q z%DV;Gt1Aw@=XwG!Brb8+NqgW`Te7QuzY~DLnLJ7t_aL}cy67KK;*S4Qo9ghplms5i znVI)+Ph(D8NjwPoUdMXyi{+IyEtodArh2N<0%jak*QJ&*0owh41^zYAjumHlZdRd7a)uI4JW-uZ%D0W6TR$Hjs;jrZ}wZiMZPW`_I#LMj--WAYDyp5KzMB& zQ+1L};b#M}-xla`ki7G6Ch#~fsNKl=rn+kj*wT;x2|i{8&kV>7RrCY`yS;-$QA+{v zuQ*SOOLrvLxl*buksShg|G=B83^AZ+(IhAwONH@X3InHI+QAZwtyLr%4;Fo+FfqOB zXt*0o3cHpEP#-$Nbp=ZUoPts3n%m>irN1f17%$xd2ZU?keuE0Qa?Ru_Rc;)*?ere8 z>8SwcPk*vrkbMZB?e&~**%N|FdR)(+M2dj(#nGWg3TB+mc(BO2Z3??;#&)hf^%(3O z+5YmnO$;wIj zoHj8h)zab5g|6Tc-Hht{;4Qr0C+<{vi1AKHLAx4Sr<5P6}6x17Ds?CTY+3z|t(*b8Yt( zaoH2A|MvYz1diI?)G57_$j=9D_s*t$!aT$t@T+LqL+9xf$H7J?IOcP1CO=UBe_nCI zcT0E}2|pO&ts#601m~OJ&k|fjg4acGWC*?ivAZSqV8l*==+6?pE~0Nh+`))^mdFi= z?^)uV#ip}oY~(D^GhcmbX9SW!qL0YvCpa1=hgYPp3rUo;+$cKFM>2hXYq)eyg$LdkGXwfKAICFTim@&kbb8z^hT3pJ{Tb zQMbAS{6}gwu^0Utf)tkTuyV6!X^Ni=VgJpOl@)s(VE5k1?VL+qaQrqhC}e*D9^~u~ zdR%k}=k3qGnb|{u3*QO5TJ}l-7tUBtz}-afTbB=JTw>5gyH20tl~Oi9k1WVKA_Dek z)}M~1cXv?SPwPbrmhuKV$Y;*n8aa)dG}W6F2x#J-XZ9tve}b^TR9H?+)C*q;aJ;vg zdK6zR&DF~7u7{g|!en89ef`;_kOzcGKg`AyZtQ6%OzU*W*90j%M9 z{e_*vSJ)p8Rx5T2 z8e@oX5kK3iNfAhiBrnOaP}XbO@-3z1Qtqy0t#+PKP{$g7jfT^BpNQT$?oU%0IwD%|C1bt$6}6m?%ZuxjIl zY9t9f38D8xe~KCQqEwB*t4vVuVzL+7qs=I}Fn$(&^Za{@hcBzVs(j&7B{}1(d*3CV+uThpShkXi=6+|f6)Aa zJ3#}d)TG>MXHavUdv~%=2KQpM7+auD?HWWUpw5%^7b$-?Q9|rZ{C#k*3VhYM2 zr13RruEYR{nBCeurptp$KeGQQ_GKH9k?hGYIy;Bd@#os@rCmhs&uBhBZ$AlXmAN^7rhgIw!Ct5D%GdIIwag zi{b*hLFsF6)Ip>=9oI>;u^v&=i|RTwjs^9)A2Tl-K$!ePzdEpuAeW$I`F#BeKzPLo z-!0)`B>Z56w}$X35S(vYlX{nycZ6p^#w4E1 z)rC_hw(8H!io=NO;vORQ{g~X&-{rocX>2gayNZ*49HA)45THyGBHdx&+L z8xLb=dy~nV2*-SfBiT?-xc~TBQQEBic)D!vm1;6StgGl)$lee4v8*#nCMo)_kebu) zTJcyFNWbW2lNulc!xQJnT5pL1-`YDtk9*%>gQP1-&JQOMrP85O{Mx2aZ&v-vfki{; z{Bxx^QQH8pKD(U|Lf3#@iyoW2AiRk9Dc*X2LP!OUf5|y2!EFb{)qgQH3Ce>#&O8q9 zM{Z-mD_m_O*UuyJ^>VFx0Y!+a=x>@V;{_7%*D7KEJ6ZpBGQ@nQV?QkVP6e*>e&1NwBa8!oRi=H 
zUSf%lXgtFP=NgMbrv;(DBP2PTv)ir>AgXTEwWr7BRZ+hzN!8s0ql`I4AUiJfpKtOc%q9T+&dWB^drC&3fb&L!s}I7q|@KjBgy~j z&ykDvntX0h>B~MI@*NDDetYagU6VeD6;oeeWi*E_%2K<@y^~mxG(55VSQG^J-|k@! z&O@AKy(xCfqOi+z7k`T2hhcY!R={EM9kK4Y*JHISvLN!5pKJCM*{2y4?H;X|j}TsQ z!got}7zsZZ;jJNj3Iyky;Lj3VM1t2vaAXL+0kOL!_F%+Lf#}Z?y)L3}K-|HIe3r-! zi0@hAofYK^ll4S9kh`-jz2OTr7_L5oCjdSe86+|?=OFoPOLHO;3n4Oj2VJK6h>Y>4W71&60bWtZ+f+r;` zlf4|#WG_=^Vde#YAkaFO;v|s{wSOKqiRpa+3)tyiAi8PL7lxz;YX^dW-7}s?Le`KO z`6mbKPKN{g;d^W<_xwPm_L`EF*D8|hbTpzh?lLHn!9KfD1Oerj8#*c#TZplmi@+k; zU)c47-D%G)FEC{!CjDa%H9Da96Kd4Bg1*V(FFd6&hilZF61QSW+;Oxv)8NMxFD>* zbUW*#pqL1i*OUD*UTwUDz9+I9O&kbo*>?bjWST?(w?2^=Fo<%o*>mluaw2M`D(Z*hQ z7(RHIl!fKDCV9b%%7AyMuV6TffXR}A0u?3MEk%~G+n55_l-tRmsq>pT8?o`{Y)q6Qbmm=b14@gz+ zolTjw`a9F%m767hNTV~$b%$bTz$vASc-~4e5Xt>v=8NfIefhaW@oOB4fIc%lZ;eYI zCOoQCQi)N93rYSfL(;*pu7lEXdCmp;7ycPczpIG9pAu5IL*Wet8%o9win+m#I}s5+FsF$ij~uU$V8sSQKgM~dXNh4Jt* zN$vcbWUqO>?VVp=YY;aP(SZ7hyGW!WvZqqHq`um1p81&lSbf8{Jmv3K)=1AECPX4< z1`y789=Y0+4dg!f_;T^7=CK^{VU(f4SXUb3}(stfAcf;f{MqQs*@0Hp^ zdd({t<{Y+65?_cVJvSU9T@F()ydX9}>ht=z`_QN!(;bC3gDzGeH2FFY7Tr3qsHz>M z-^WEDGQj*QVvP;(h&Y}cuig*8Co49Z`Un7r{O2Vil932gtO25X_dNFG2Kh&bcSByw z+bY~(zKwXcf8J)x?U-GB5p<0$FYV(9dP zZ8u2EkXsbo=?rU&Oxl{Y4I!6-sUz)86)VLq63WG^(Uy?S)hB%zak;(C)gux)$fOIq+vFd7abDS z>Hp^yCw#Yrhmr7u5#AcYr$BJN3H~g>MI?A#1V@J88xXr&Vh={_6o~#T(d#1m2E-kV z$Y+V%fcTyz-q|0=PqlcGIo8x49v_Qurw5vgS4++1IAED)i}zp(6`Y_tav-wpGZquS zoR+`>k=>%5*<-XKu)5LtO{(Jo`0B! 
zB=OcA#hR5V(xCJT&z?PWKvp&O0~ISL=w?GKZmjPE2M>PmKX~jnHf4M8qQznlR`Eys zz<1AW?C5%CHrSoP3cvhy*?)!|ZMSp={w!i>%Yg+yzkC+-?mnjj`xU71iR;2=xZZ}q zq`UBR@iA%iO5<^K`2_`TDF1nT+9?!{vcE%5oZpQhgsdAMQ^rrv1(1|Fuy6vdUqOO3iy@o^wN@g%(pV6LZ@@}#+OKPU*OHo&DjFPjMv5^)Bh$GSE~2fq>U3K zwF*_&D(=*GtLmGSud6}1bc2kbl;Amj&5smS=+5d(As2RL_`(6eH^(J=;-h*QfQ392A2v{pGl;mQIvIzbm}|G7pA zKcm}qh4F?G?$*d(5l7|}BgKyoN$(y+jRq-x8p_{++effJ97YPb?0_;p7Jm?>Q{l6{ zQI`qj`k})T`F%y_qS%c~gwVX5@`PtlY0aC{W0Zn~+@uKyErezNf0afT0OFzCNwRG%X)asI68`6UK3!afxHaF$}p zzpwpc#jhhjc*VCwzt&=V;)RY^_SPWAT_(nzyn=A}FgJ}+2qT#2cYM*1qyX*z&F^Z~ zea7^!Mpf}yK1a68;(UkRv|)@7U8uB*-XiL02PlN-I6!1Tx#ecaejsqM+S*$A0Fals zz(r+G13G6{5Qw z$achylJTUbSuI9*#R=an;bA2FV1&1Z@F@_SZ-PHda1jY!7r~Js_y)x8me_+4I|ZUY zOZ2*kz5#IuBl1}yHz2-eiFfw;67w7T6%0vPW`6c!_7&pqsH?L_x)bY6sJM0GH42T& zqm-_G76T3lEzkLv9}p?iqtdHPkC3zXlmz1B^zK#L{Qy-MZ&BbuP*t*}1k{ejD!>9gu)kt8GlNdkt zM-%q_AY7IP8D_(2PQR5^DGkmpD3doSwzi&U40`jv%=KR_(G1Fh? zn!apRllyCB`_49GVnIFA=gZ+Cnh z=2nEBcT3U$gOVH%qf2ywz=-x-eB30)n9^*JzUL>>{{APAxXo)sEAPf%r)UlsKIr(k zri>TZMcE{Z)a(Uwt74rq*DA34-WN{yKd(S+J2^TGbYij5eQ5OF#B|K;i58x?eh^&o zz1S!)#|V|cL#L2nHNdCINxI_Sjk)}lIr+0{453ozz!D_iVujY%+4ue^Lmo%C=81aA z0E+^S58mVS;A8$k>!Lg@$iGA<$%#vYa|}{zR3Vkv9qagAM)Va@FyXmnb|DJ6+Iwoe zrQaJ1b8Q<{R?|g>tl?+hD|>OT)K~H`7C(?k^Y-uOKG$Q^iJJAI7l)A^o6KjKT_QM# zlhMf`GKH-D)_na#7X_5B>i4b4Py^HxwZCX zT`*pMw{`l}0rctcFfK-|gEmxpmUFY+0eP=K^6+blp?iO%Gq=5!MTJ%}AMX>41&K## zn!DS0(fBL(PVZ0%!_>;xl{c)P)SKS?eEy_Y7oy1cYuMBHHNxB7SA$QCVHXzL&V33c zLC@`EHjYaq_~S%X!+`#Q)hkP5RAZ4a$?~^MW<3A!O?2UM4oMzjuGa+1Da)%D%(3f?EQ{&@&3(Sqn*Bv<6=CMiLN?q&qV zb))6?x$0$1l)by1Yn&O^sk+^xmGBSA(GbeejH<&fb7$$SasI9Ed|k=U!BB!_A^mn< zqc|elT~h}qRY2Btj{V^enlRz733_{fcOqGOwG-SQHGxXSwv__G8Tk>GU^92tUdKtavd)JNVKZ;U+#JkxL4~5K z2kV9u{5E@+Ple|VRvA1Pwq!PeShjzD%fr!&JrNq}y{gFr?8Z_;owHcrO83FtKE_GX zF?$*>d!K6TQ-2*ViMI^1Xd1tK^kDzp~)hWImO9-WJ09Cao8gP*J?AujOh8A?YpdbcVU*ZmQDiV?6Bd)e8{C)3efO3O3`S#2IHS$ zcphQcius;38O{l>M?@>q{SA2xK!qybgWx;*ptJ+2;Pw8Fp&9gg`+m-0krn(*qrH8| z&SSclb7PvoY3OMKx3Lunu<&n;I66<>w=;VjVV}oZdM{!^4xI?37ZP1PqX4wc#JG-k 
z@W7LQElkh1F#@jYPw&JWqOoF+b@kcI0!$_0-1_(D#h9v*tlI)(5uzDE9rgU?&-(wP z>Ae4He&0CW(%xI8z0oA&yt_}OLDEh`w1h`@UA}WRdsPQiLJv?|aEtQOwOCG1>dUhTu{&v7=xDUQaP!y#rPn8hWe(90 z(-Pb2>Td)1*SCA1veXoQv^u7<$BvquPv^!;yz&Y8FBrt;PW=PJIu184jqSnyJ>LDW zXOR(W|1~qhy>bYMs(q23oD$8pZ!5SQHj&QI`56hyd)U{7OJQu& za@3>7T(I$Y#?(QXqZkyeUB0cK4}M4pJ{XGZL>0}^NJYMu=gp)f z9KV}h%vm7;lL|Tf?z6~2cH%2 z4`rd8k6d>tkVarN?6O>cn2{*>Q6CGg7A79F%uHg~ElxZfP*n5ua|x*0ANb0Jy#a;$ z2G_=J5HJ-H#f0<3XDD%AGc_l34mlnUN4sC2!6v^ly(#Jah#LLgl77ofBacmsJ5Tm` zV$A>gj#zX4Ko1&t7{cb&;bWbykJlH4v3-T1%KyD*ISrXS^eJ<;M(qdF`6HD;^4ihz zcmX2_qMDY=mejGLo98$Vo%w_^-DjO^YAV10nQheZcqjOfZ1#oeiwK5$#c|&)?qS6J zV7Rvi_bK3CY&9bENzq9xT*2JB>f1NZLBL~fFeW}@n3^m9e`^Q%q-Y^i=c zTMLSZO53!~V-a@nv3{FwLHa#d^y*m?H-8v}(Wb|Wf}dhm205zjo%z^z*GrZoN|ZUu zG2+bd;5HPmzx@4%B7yuq?|xH`dm5~;-&_&9d;=1bUtPURI7-%k`098}#bX%TYdG4~ z@(@0=44Jz`tx2}uee|tDT@uvIc%SgdEE3keBzrhI5XpkJ&0{04y0Wti@+0n?~{>lRjOK;wp!H?)sPV-srqju$G&Q1JDHun0{iB5UN)C$-Pb z;C|q`_ zzdU{pb_G3d{8U+qO0&tmjy@@%@XyV+Q;PM#z45-DR{*6ub&o7`HmXys1iO{s`FnR*}VZ8Mz!rZMn1Tt3(#??${cySZBOEepdW zbh8VAC#)aZWBS6?ib&H_vo>%i0h3f+V9R&=g?1!j<@Thupewm4Ji)Ybkdr&wn0N0W zL11g#St?u-mI<_7ToYg;=U^on)dh>_yYO7i^}JK?LD|05YX>AChrL{{`V|>$;wzi9 zy*wuwRB8PD@bC|srpnVG#Rb6whWjoQ zo=)_y^*9N07%9C)GP-ID2ccZ9jmRIgsBwe&=eRu<#pWhqov#PK1nsqI%3~up=P*Y- zkhQ`RFZMbAcLzHg+1~WuezvoZ$&&s6fTsFFb>;kr;P8O~I?zVhpFdHNXVjer{|0{@ zjbp3=KEgt@iYu1b(xE*+MpTc(5Wxuldm=iJyP}+Ubheh#w+|HQnCO%Tbr2<|dWuFpbr1Q?Npo~>G7hs{-uo>bg=0!aT`Y@B z{N!%h^b()(7>ulxUOE8JW6qC;y`{+1#JV&cdd6MOn6rG4WD?&Kq$n8^X;@o}o=g4l zzLg`3N%Y@U*;^-xvF4P#DiWrLeV;N3v&=PURFF^6b!!0~>0agKv(v&Dt(Q`y$j30l z&-y~6Kn}7j*{yNks6|?_|6a}#)=<2t&QHhpl=sMtu3+X1Q%q%$kUg}Q3BFHUFtw{l zKsU#)UhzC}6TDzJH+*xd9NnDiiU_#+ne?I4p~m;jIzjP_8(T&*57z4>aZjgni=Z*R zJt@Lcg8HcL{5+kSfkdo!Z5OFcpoOEq#rf;CVehAR+Qa+*k*1c-NMGI@#l-2Q>~s=V z2#)^_2+%}%q2Yg;0avvnkeB?$B7eG2lt%p1@zL@F%KoZ*bg_H!wOPBQT^IQt<&V=ONUWl20p zNVpXagx@xZl@*qOdn>5^#~~KXQF?SM<19C-1Q`CuiFXM{2D2 z>~|m9UN5M{oF0CU)d7x*7!-=`xd0t_MO`F`6i&#S;h{#O!`P>&ikbA>V$|mQUNweV 
z4cjf6`)`{~3xlbL_nj-~MCDC;a;S@sK>Ovof<|3=m>AhQ^v$y!{R*U3_P%6{yG$rs|L$O`C)af#>s|_1Sn9O8o&*-J*s5Y-$^u=`tlEEbmQ^A7%hxkCX#SCc z$B4c%T>5>rVTP9<3K9);0^FprJQM2ke)pnLH)p=keeeu$zIq|?BQX}?UUA%ai+dPx zKN#+ui=NsqG;#@?W*M)OraJ~WFyT$upcuxV(&*Hf*Ja2&C!SMYozBj<%v-q4{ zu^UNpjpzn1l?3Ngerdpu8v4)WSQOyEzuyV#Uyfm!Kl0e#F?>XIXDvC^Ick7RXA)v+tru=+;1Lzo$lIep+eUb1Vb`Lf+NN^fP9 zx9NAU2)SHZFJw0>GuduZg42t3A8~luhe^;;i)fScFtK+;p7=cD3`>okI=MjB{++@J z3ZG=RqA#_hE-~=I7T4!ORq~boB;P9PJQ&klw9Bzwm)IUTl~(4mpUj`Mbo`%HE?iRf z(Wl`!MpW^&stjA0Leh1&^sAGf!0%3{2e#W}i0p@-TBlyAPwzLIqCT2FBne_{WO(4^Mgd)MW<;WpIM>LnvTZ~oFS58~{I63*C+RKmaa^w<& z;af&dZ?PQvjVd9h$NsmKI9XM&Yq9cC zHnvl8NQx-AkDNHO`17sOKhoKR2EmpqY%p{%raM)21iYs0?mZyg4Hi6e4PA_~(XGgv z(G{s@VECA}(s|W`*b?m*o%A1!M6A^AwY>Biu<2*uNsezoSMQTA&e+?*Ohsj{L!UJ< zjXwr7A3pOCsW<)Rzw*)$j~72YM7!3891^ho-fv5e zziIe_Zh6}kZ=m~tma$-O%V0I4VxZENnlu4_qgoOZyR@(l>B5_e&jHNJ)Qebi(Zt#g zGneQ@t)ij0y@ZjxugK2Psa&ME8{IP!aL@t8K=)|sqmtBR!kFF!w`9#hjFU9@fH#1` ztxIgGbu!?9;}RAwp2s>s&5-&vVqzZX2)iUf6ix)^a(OdFUSxowwwR6;QY<>Tk$U0u zx&%4M^6l<;XrdS+29Wcdr1KBF<(w17G&h7&VyTKw|1Q(^&l2;DreiAW?(688_Jt3 zkI_ZmAH7cD92+M++%Ey-Df`mKkfWFe_RS}3_X>Y4vP`>D@WZDOz;gvX1N)OfC$TN_ zljU7h&93C#{UruW%Kpd|x^|d6SlqIZM>CBSP08ayg6ib=FWat8MTnCR)NAM@v?vhE zN?s05Rbj*ztbR>bejg`(`8Gd#m%)-~a_w~BA6W`pw_^Hb!)c88?7<))-LW7ZoRSSQ zFiOYbawlt?DEMM{CcLU5(u8QFrs^=pm4>bVn$16=LnPjDS*VR|IZB-QcOoV|JpmK0 z>LncHS0ajYc|~t0D?uLOb!VB+(%8WKpP=_=K7h;5vzktA-T8;6 z1e!{x)b?{LU|OGIR)uTsp?wDD+0)HB(648E%Wv#sATyP?**lFLgq@5crKEZycB|?1 z&+w*o#Cq~+85bp&UP}8+ymNgEaXvZ8_^43|v!A67d(Eu^Qxx0>{U+65!SHhv-li@v zo!V<>vEYN+-~DrPDXvHNzREQn((DDy=@r9k+oCX@NzkVxKoH{;W9!Ibkiy;(dgOx6 z&4F{CZCP0V6fm8V__xTM4UEtDJDW3p2ZzV*CbowtVvV#Pg|CVTL&usp*}oacDd z{^rMO7+2in*WEuW(RgG=i~D*75-wY>Bfn2Yj1Mr2mIL;{i>YuhD(M-LoS4qeHDo7J z{o-00a^fMXR4C=#_`HZ-fy4SS7&q~H)U9oYo*`7>6s5rTPY14r>zHWhyhclk$3OdP zj-#E3qbZ9VCQzk`;jotN7c`t@ZRYmqIm#G*S8M&t4Nm_Y5e|5^h$ig0wT{^A!32kc zOA8OI5jGOGp6XEY23cI%>-v;CSpKxir8>%f_QADY4Pt-}R&dWYqOMI6E*wXqfs{E* 
z#dhn+g@j^_&K2p!ro;t6l^hjt>_cF%l1q{EnyFK=)vuKwU%5kodnK-+*B_l)NC3bHQMjCUMQ;2I%xJy2XaKu&z7D3hZ?_xM({ppLG|3KpNn-qA*K!!_sM5DKtyowDTgRU ztjSV&tVw+zbS+_t4p84B`H6Cl*9Go}%)c%bUNQQNvh|BwOlnHd@!x{Nz1+8vZGLxt zC|?2!7`Q~vlDh(S|E-p7tCJ!MJ}?Z>_J51MI|NTf2J9xPDG7x)sL&E2$9Er{n=Hgk zzK5II4+P0t+ou)Mo=KCRaJ}G3w$>-#43V_Yx=(`FmpcuO*p3tX*9aNX#(HGFEjI_N zohoQUEMK}5d7Nk!!Bu)GPK$iJ+54YiVZ@zv+-U6g(ZmR)LL`_h*WyW~=Q zZLm;){5DLf{yXa&V(*@FPPw=aW^W%ork|q@jqEI_53H(TqH-}m`J^S_q9Cnw)uCp< z#x{9CX;&VC`gg5w?*9zZN7p3Y_jICv-4OzPPP&jluJUAh^sYG)WE`IlX;V-EQwbu_` zmOjl64NB>+D^Gku17&cWBQXg~kL)m1IXk28)yGwx4W1z*&!x2!)X#vOg-}Jfs|1lG zrdB=Zt4{O^;+;1q9U$`VnQn6KWFUU_^SZ{^IE*TSSoRJCYhdg4dZXIREhyCaH=_bA zBiT|muE!e;&_y_e}WObjd)evJp$`k3_)ZJUFsL;#qupPY&g(s2o zV>6(i^j@8tkygA7Ef}v+*9VRU}lsRjj%r$gG<2RrUI!+hfrigWTo|M?Br^dW57Y$cj;ex9l`_t5)WFu0?al?4C zda%lJN95tT41jyZao;WOVZ{AlxVHxPDd3!MoIi_m5piA@&XK|S26*ol?}Onz1w22C z=eqE`0e%O=_p|uk0DsTobM_i%*QC_FC6H=!^&YjC23)k~IW2s4KPKHSwKn#U6N-x) zYVp}u0j>ZmmsZbORPVDKnz~I!a(A5M&q>vT*(kZlk@hI)n=BRTbKDBbPWUW_9FBt% z>&{|PCj+5ZnVuoMo`QY$_1|E*_8fb$_hY)st%umCk#BP#F$A)+FWsU)%}@R=?e=fO zF%A~T-ru0n~z+fNBhpgJr>q1zUn??77pdA<*SwU3y>!nM!L(lqM*N{ zK;$s>MR?=b$ywnjell0;k#@s$S6FM+@q3Lb3(;)l6k6^SgLiWG7*L%Il=Ipg*=ZAp zr95WEo8f;6;uGs^wkc%PsOUEfIEO)I^UY!yrVYWY=$|jbTCn)ey~Q-^Jy>GHhh&Sr zAJD6gzz&k@6lfYb(#a?*0#6IBI!t{tgVfv^E-@$e!tSV6wFZp`NSb$CSK(p+SmZxq z($SRy6uY}mJpM6H617eio$4ARwbTg9+ZyA#E%DaYaJcT~k<} z27^wYtZ`ZeHz4^_PobK#7(Bo8%B-o073!t^I#%TJmjKnm7iadrAlVnS%Zah=!e-5- z9~w&2z=K!%Oh;WVW5S2}Ug!N$$GjQWPi##2Vaz!!-83F&F|UYnha0uw@WIaCsnU`J zNabX>Kz3Kau2+UX?R)42)u|Y=o1bYC1G^>6>)u|*{?yI220w9w0{veGJ!mzFSt-ua z;Y2^|vrA~pmVYpOSSu!YG=@OD#;X3>#3~G{{&Q?CCifhCGJjxRi)kP6%-g+b%1jrq zJ?d>ePFHG?;*-lR|E@npu;3&=f6WrfB`+}PcJCO0al|O1d3A$utS7m-?&l~fYsrYc zEnkmBf5-Fxah8VNqJ99$G}ep+grjWGG|?)|9J=*pEw@9qMlHw1T8NF2npSP!yL zwMjvHrU9O*vvSw~u<_h3izkhJxNCx9<|-*35_FOU)*IuN+7%#s2H-;bRMbI0Pf{On^s zxMA!`ez9vV=drnhoz~&7cX&x*9vH;2*40Mw;!YKP$;92*F>smbz##VLjKs?3{ z6Pq9MR?U1RO$5l@2+q$y^FPEB-9#=?dcX;JI@SXym 
zpT%=sc-{cNgW>yGd~bliXYn~JeIk{?SF;Mh=XM>dRBzEGx{jI+u}EZYu%@4_xkd6i zeNl#Q4=22eA}m!~#o%`bg)IBzvDlpB^o(J;9p*Q{_H0@9Hm0iR`M_Qz z1iKx(Suy944)ZGRuI|^(g9c2bkkRuQFlVf{{t-(&rcqa{F?xCeaUZliZ5Mn4qq%j- zIA<*uQ%vB|JEb;>XktzlT+oTb=w+;kf3>`AE9pLo$npBo?618e+UHjIAs%x()X- zVxL7iJT>kGps(L(3LW0|peW13r((PhV<$IAkJ);)v6@h8+V1Kl(uSYs)W&u=68PyV z|9xke(ii)X%*CULxhYrzJ2P{vv!cs$YxgRlLYGRXvE(3F^rf}+2z3N=a_ijB($bO0 z@PQ)NPt;i8bfb2?@FGEUm8(6Wl@?=uscLjVcb#DQVwY74{XMic7bos}AsH2g1X2Hd z+lr!(Ugfbil!jY`cOn0zWw06NrJg2%{TQS9USeOw6iF|!ES6tQ73B|}{ZO!S9)-^B zr&soLMy8ef69O3XkgrF+omdYQ#>-vYFBYy6^bKU61sh2NUBnec(zRD=1yX+7F9t;7r&nphjk@5b+)Z*Pvs-SRiYl0QObt$%Uj$3G#2N*9kiU9?;X1d2Y8> z1sJLfe4xRW1-$>{a+{fq5?+7@?#+(0t-dwU)aGy!@V1)iA#wOzhYmy1&p4B<{2~Se?|yTu(9Gb8Az; zPT$Xr*izJj=4?FI{0_fHd)=kWsjfZ-cVZ8G2}&qLe@=Y#=wTTnoEqG?y5}Dac4y3o zo_}hTbos%vM^C@45JtXS_tY&xXl8gNl(Y36@X(d!?ggF5{fUMxxAk6%2lk-;0Fx5N ztoqaw9NR4aKv6>@l9v}P;VZpce>@2V!#u#Ht@10^g3PShxx_x7oz%JG9y3z4X1MB8-60Om? zfnEevnNcq&W3JH#cM48$L;8IO9GwH+gX)`eY?+BCFtXW|L{w`F3O(WuhW~Lyb7Q90 z8>07MuTART*E#%uuQINt#8-Qs;P zyr+QYXYpJYo;SenVEBF(-y7iXS$xi}s?fRLR?MZuWnOzFYx#nv{axp#qDM%vV+EX( zyo}hR3>D>pn?53!zh$u$)(Ep~v!;?@VyNFjeUjpXmD*GK^GlN&`1Q^CmwV*i9 z2^E&x=deICDi2)&U#zCFlUG8+2|M0yA9HN-98Bx#a&pz9CO#?OEmz*)gH6&YpE=ij z2GR=*$$TB5B5GWEOhrxb#Vj3Rm-PX2xF#>)%xOtQ+|Y=RS&;i67xg)rbZBg=xB`T6B-c+g@jAxvQx%xK_y zjemV{UO7N)>Pq0%*#}6Kcwp~~5(*hW@AV$HN8BogoBo2(b{fP<&C&E$A2;-?*#@)q|8 zvXN{A?HNgoYUpEi^YwkC+msIQo?SE;54~6ac-0i)SYKn`+ixbIki*Q@f^;19U*rvM z(xV3E66@l2$Qgx7TvxFY+ahf1_%;RpM1)rTuf_o^h#+cnO@qg?k|Z6u#@SZ>jey-! 
z@0q0*z$9(s&y^%leqWlsJ~~AfIiCBL)lyu9D z(ZdmoTVY^|;h-YRP9FHO`Oz2Z%MtYNW$9tj>7VR+cb1S#2NS16vn8B~r?R{DDi%!Y zY>*z>IHN$OoGa{axUo|?1zG#o=`d&geWLe1(!&{=Qr)Kf9`6zw~mZ zZnnt7fvpk#cP=4FeMGt^Siluj-@Zy+CFq6x4D_bMSyoA9gT9Z(Vm=dCrl~LVluVH> z2~0`m+AR{wtvOZ0q~g)O%JT^lJh5OH2idpP z2E{TK2*!i@HggUKFbSQb^%+IYKxD;`SxScrw%&WAH$lmDPk+;Uunv8}#j0~OYK?yh z(;Dtq&QA9bGU|HZ)Am)85Hs~K+u1y{q@*lG$s2&fIVZ9MB51H)p>FNvmsA+P7$3vM zCSI7r8dUh-dv@;jwB&ztR`4FjgkFjXvfm@(ZradHnlDH$I9@eRk`+FYba7#XidlK=b}rTcEuP9Q_g~xaZ|0sPMg>bJ(Q^Kt1z0 z>4A0R^K8Ec^8Mj=Pq4rC3pFK1EvyN)fw7%SGWkZ2 zF`+ZRSV~!YP^$bX+N6_87}Z_z^Ytq&1ix~Z#y^EY6fdugf86#HwDtgy_F&i0$JqI< z{AN>3WHR8Js;hzL;bs4Fzx0Xq|j|_F}ysJs( zBE~wEhAV_*mQ3#>OQERqQtgbW?IZLyLhYviRSy(-a`NuhsuoDU>XPC33ZZkDi$My{)k-cipK7rGOPt==K zx9h|qR+rIk;#h(*%zKy8IkS+iiNXL!2m<-Lvwphti(9vna53byZNes;4wkJ$M_k@A}>zFK#(D|T{MrJycm9eQT`-}7wb9jDnAcK}=Lat8x@RX_Vt_Oswg>8P-Y zzAU!8_0Vsx2w}MY&C^(+s;fUPm_+V2iZGbGieG zvplY9WQaA_c_$($Jf7*%$spVNF3_p?+{}?( z%tR&W*p5AXo1{p)?3kCoIG}e~9oGewYRY{+bWC|Y4@l}g_f%mjKvrQwS1lyO;al5| z&{uL8q;a?C-W4tP(}{|A%+PfoL0iLojU<^bE4Tgh7wM7RY2_G=*I>ox(IEX zj@OkMbl~iz#QC5~P#+YzDx&%vIcCV%Xv@;WV#}pn*5?}t**_1{{(Hp)w>x*FjjjsA zt_xp(v+Wdtft_QFHcm;1b@-4A)rJ|mGeKWNKi~&`8Pyx<>N%k{)~7V)uLOvU1ac0$ zM;Dk&SR7(^=OxA~iFls-LPO*>WDYBI;wO_TE}a+@m8A5m`5!c0ktFAAhWB5y79y8< zWLt|eBtz1B7Sf|R1@goBSGH3tyUC8+r}&efCqcT3`a`W!isXWQ#5x8=ezJ@27<04* z#qSam$oBWCJUNs3Zpf1rX7ZP0mc$FKq0p^Q-7xP4CwW4SefI88Y3yiFcHZQ_Rl>4% zPTVRT8BC?N8M8Q)fD}7Hy~X@2z-{VYuy9KiiyzAv_oGB#>i2kk?YcMxEQ*M)t~~q( ziqD%X{(bizq-~k5nC@s`(XXWqndvd?Mz&n(+x2BI|GEF-*@-bQy{Q;EQN&wD?5*=mEO@`zuK%Eg9`R zP#vau!v%-f%wht>CJ2qsJWE41)Zw`2yo$z`UUcxy;Y*jD%h2OX>K;Y|?~%x=+av*c zg4TRqeuv|u1nziRTYX_3>^is349zGTMl17gS83lg!8}?yjBZa6fNx8c!nR{UQQym} zf(QM9$cGOfthGZy_aiH=e+@Qh?b0q=l~^-;iPUdj%@yQfG=iet?5_*Tr%ch|szw;>y3KX3$t4C#=la z4Wc#v-n(O_56vU{600r_0n4~%O_Y0sH<%wL0hn^3dnLIW ziKjMHsA^P#cxg+W)u(?!=ea5~wY+(Rd&P0zE$(5&{b0DaM&|!M1)TGZ^Jj4`BF^i= zIWjok0Po%6eK5SIfahoNTo;}R-&ifM9xaG-hvNeJiW{wS~dUg@bPyMEa*qPk7(_uyLX!}+|yI^cU= 
z>#jN-C8ynd|L%KJPMF1nY8e%h2U#VfjgD8!Qwv?`sX`4wZgSA1((m%VAQc z@!gey^o2c{sH69(LNZ&?{g2TfECf;_|`BMu7|8QPDO))+=Jhow{ zZ+8~4?XK$&QGX7uE#C+}`q&>_UAidf=B$lihuZP&o9BQJ6RG6_iHkgnK2F*5 zFz?s`7!I9p(!Rk>{CxkfRWUgph85gA1Fo(BPeJ)wMFTnPovv5oT%Rm;4B*Xi2-D9ANS_I-frl# z-&6L{#1!c-ea@d{^Zk(D;M>qDg%4$6$s_-jBMO}7d=m6CJrDJMwiAy{Ed@cWirQaO zRbij$m8F49af~_Jwz9CL_$%qTp5?<3V_NQ%J?DzZi^I>}s5lFGekTtA!8FWuQ{E{pNT7io)&)+u6od zKI}}=TIKz!ae}CSQPF?zS>ct=?QY7P4Nz3%kbIm3WbbO-y#D+uD34{@mfqnZ6F#S3 z+;>x&-1XbLw=GGMT*J?K_)u&Y(tJ*hySoYKb3Wco_BDl85_}%>J9e;wo?Fw8c^KW7so$qn*Nmz; zuF<}mt^+5}zx8L77KHb`3tLL;STLotB5_^5`{19X>8nF>HxPf|&4?2xF?4F=Osbvv zW3+no(zeX86cGEX>#qBA4anN+dUsuxazA+^Gi4BJ3U7T)RBEKa&#ntzthTwIjmF+} zR~95^Bf|`GQ-|^su#gzkxjD}Omqzx!KODA58Xdh^(K6Xb_*MSt@L&^#XLUKUacnIK zJ=c`(eYYBogj7~K!*g^HpLp6qzrPpIAsU%CjkM=M_AjqrL!Vg5tpY`_csahK*%TK} z88Hs>@w)J@Va3~M(Dt5z(_bNCkq%cK?>}iGE&D@S_p`f*kE~`^e&mXf9d}exK3~d( zf_7x4B=z0IyNdJ+e<}Ph3qyXn4(eQJ$6zAJJ|#$eby#@sb2UD4PieyA0`-^h!s54} zNfSQeyB>Mr!jmhg;M7YKy^utx?o-|&pS1*bvbTCfgJiKb?tAXk`Erj32b z^u4XqMgr5KeLlST-3&%ty46#XQ3WRVG`7!_$DsB*Tq~^7WkBA!jB3NT90jLj7z@|b z6N0oa(de78L6@ovT16@}uuR)`-rtpSCoLUU33Np z*j2!|OOQ~9`c>^pW*X(OjPNF}i{boGqMtU?LFOq+I%!<4?~;tD)Yr(4{RLo2>9%e5 zyOY@VhlGAMV+~l-{OL*9^-^SgXg49IJrooz1`BR~B7tuM-sY+lUZucEWoWWb1d9*7 zqWfI0j^Nb#$hw5GH!$@{J@emtmQ(y77E7743^u9-A!hfHN2-pvPy;8ieX2kIE~6On zple0tUk(8xcYI@5`=2pNu6^>%_;L@J)61<`#r6~}$IxQ7wK2L!_#Fqy# zKNxC$Z?|M@T&Azt<((0CUv!IHca5jb&SQ!ExyMdQ-(iGI$2)Ueu>3Q*H0h-B9oG-! 
zuw&Fzv+TO$y(l^$Tg(e9RwY0GFsw!PY2@4z+TtLezgDBMJKYSEOcUtNd(B7|)l7bS zNp2Yp@NRG1P2+){`!tr%7X3mk4BEeLTluvd(Y&b~HVNESf6E+}T{$ch#i_&WzA~4-oEjol5IzL^o zrU-6oJUDKSjdPPzKU%Noh)WYMECq4dPjX^xt!|g+)HVoik2UkJmQXyHeqX;o5Sl@* zI+=D;eg8mSKb>f#0fx149Q7&YkcMn$I2sgxia@OibJ4i6L6qk%a?|(c1n9*CBLYeW zfs(7Z&nK%cG#{_{@hICOi9IBM=VN;hK{r=3HNRtlFkG{etV7v*1b=p!xJg|K)J{e7 z+@5#^N^18$roEhvXn69pKl;#P<$H!hjAj3j=5*L;I*iIm>B6!>;h!0ysn-u1n)#1l zp`g&<$Llg=ks_avs#{CpTfL)*5t2~LljkL~mQTQ!ud6*yJn9vj-DR_SD`(#uOKTk4 zrqdy9T=%Q;yXRiL+hcQU|Iu~CrgJw{E;oRDq=5XVci;*6e7|WzEA9d3R9zLP3Y}JqQNo4muOX6zcjs- z3vmX}Cq+l39Ch+3?CAT~{qqB7X9hs5a{p^jP zSLANuhgK%zi^O>PkNaL7X(4HIt9~m%mSnHJOds!MmJ#QVOb%X?S|-Rmlk=s0lu35| z_T+y=eR(`p&;S2b2&pV7p;BaNLm}L8$C@?!RuQsi$-c|AuZcui6UtIaD9b(WP>O8X z$&#f~ku4Qj>UZ@1`2PGekH>h;oHKLIecso7zMikw9M(vzgZs@V8xGirgUh6!M|4~Y z5Pyi3c;UTuih}NsFB2M7lqlIdq9USxkaud~LGYb$xWD7q-ZIYXhfn&ti2MgyVM#JI4E&DwnjYURRL1^IiOa7W5(TFBN zP*vOFl(uzns8V*mhDd8mZ0cg`j@$8jQ6n3BI8oX4_nM3?UH z+l_>`lr_dM|D_&1x-!6DF-y!)U}8Pk`kUH7Uo}NrFhRAYD~Su7+5rZfWVQL`-&0?A zTW9FJaRs2_cPe9!COG(!^i7h%5C$x|Y!g2zj*<=-KYxEw89wg0cj$mLA9%3j{W)bE z0EIR$<$`cA$k~6!{lOa+G-YtSq^0!`Ic$&h%jI8T`}GMGNJR0g*(pWRY9F(=(S5&3NIl3|UnEW^9o z0+Y2CR@t)MIR|8~5nlOykNg2v7Ej3)doG19lzHG)rZDOZxuR@Q??!5S z_cJx*i7F{Sr0tKiNv*8-`=yg~$~LklynFwy?6yEXt|q#w?1d;dZ%IIYYdPZ0rJUcx z{RnL-9M~pF;{w@Pe(m|=m_xiYdB2t@Yly1H=;ag-J%_u$HQKJwFA;qM#jhIZ+(kb9 zc_Dm@bue2N|;FN;vjM zK-ZmcLA&QV`u(x);U1h<|Jn5;+r5zqIHx!iD~^=HfMV;ukOT_Mo8f-{!tW)rTYa2f zBeV_Blt>tNT|bDiojH`QSvpU=Y`XegG4~uO@#s~tZoS!%8W-izGi=d~cc;f%`B=dvBoPA;=^q}Kk{s9kSn&!K#ey0sc) zag7p?+C>vV1c+g&r|Eky5#~JIy-&Z68(r`ZB~dHJWV{OZPG3kKtC5^wpZUZ2x#slf z?>9FVBS=BlkFGXsyG5Nc;JduHhfLbSp)Hqfb&DD4N5 zqL-5uN}EKJvPTwGO$N~WuNrhs0Zpha@Q#x4>UjvZ{TUt>PXtqpUYB09SirhOm)I^X zP3ZKoY`0gqCuqKS^3&ILU0CRm;`l~Z0v;Vm)9#eJ0Msg~vJTA3z$@h8_tCc8uvwZ) zgambv5T57Bfv1)WLDQ`zFMPq!z^#SQV)3f zOM!}FhyfV5{q*@qjthW$rOn3tt{rR=l&AQ&C?LP3ksJpPL+HovG{mMX3YgG`$LHC_ zL8^l|Pw+tn*m#D>Hok)gKSz&0XxnK~*e54?i{{7$aDk?bp$lL~Q zzkLoN*MRBKM498TY>R&8pF@H$bF1#~SOO0!bKfbDu5<#6{JHp9AebAp)D~RcHN*pG 
zo7GPWnJ!VoTy3pR8WEt#`tzuP$N2dc#$e^6(0A&mHJ70J;3Jsk@xTL>t70I;b&>US z@o6A!$$n+eXK}19#bLW^Cmp15bzq=TTcv(k$KL&Bpo5EpqX$zTG?3JoQ=;pS)KKFX z&iwOdAMpQ&vm{G(le?eNF(}-rDl|D(`{MnrD;G+3qNQdLpCM8(azCB-=N#-X z@|dX!4^1F z>-#epl1Rf}6+rbR9@2IwPcv~bx3-h{!imWjS5cMILZ$q2DKeydxSMdU z2Jv*;tQSR;qjP4Q`}gRPA(wwmCzrt@arCTK+&aG<`bOlt`e=a)=^QFls$#^!6Rzzw zeq;Ah>+o*Hrh|qx%9*za29QoPL9T zg&M2(%d3O%eYvDxyA-jD`TdlH@82oM7A3y0_Le~&)w}g(6$Qv*<<#kyfi=k4PEzYs z6cyICC`j2DysORFe!y&X&wiBNIJd=UOE}6~V!jqY$U$Fd|4Lsi%c#{&dZ`!tc`_+uTeBNmIWW9DdDoXzG(RJDMw%<>q{tVZu^LI}@e@i9) z>U1mC`&mb+Wpb|2C|5y^sJ+lcQw4I-&mDnB1<}sAkZrUBudjB2S zrR*UIzWa*i2j;u>O}Jp?Bwkxb8Md_?PMC z485=gNY7gx)fZ5JcfvZcmB1w6SzW(uu@VSOtRK&L*`-3F`WpXVI#*>Vfmet7bIoGGv$mtkmV3#-)c*9g z!4VxaaWz|@_Q*MSyr4bisfGqrpAD6}A#fHIC3AKNJe()0+(>UypScL~xQp_l$E?BR zcQv^IB}v5mc#p=Fu%l4^%(IKhH;+LH>a?o~IEuK0BmAx6EU?cGf%E?I3!-TqK-vbv0NzT(eG^Zc20@^xRjy%&a_##Ep>U$v@9A{Ph zIh-U9qDNSz4oy};iX^xxOb%vw;M zAKfN$X4ebqnOd7mdL=8ho)0lIwx3h7%o+9!ZvFMdTt3>C*GE{fM#5^#9yKSF8%h31 zyRU=FJNbnDFa0CZ>HG7l!5qwB>)N9Z(0Kv7{MDoFIffrc#pxmF70ZttT#trazx{_q zZyKEcL!3x?{ZZfxP1-ezO@^psM&v7E3T;DJh&uQT61 z-0ba)8WiA(a~?4%@VTp+vq1wMN4^@lGL~`;)fKa@63)liD-1_uK*5nNdaT?KA6p?|jyYwD$TAY4$1b0PoU; z?~(T)hsUk*(c(gs@P?xfbU9*?TDzH2yUanwi6|Ce;9tA~{qaeP?`(8A-&gjWj-cK>YKCaLm-02QDpgkH&UKpjCB#-eH@4z-jO`&#l!I zs2VqacEx#ArgBIpHobYdW|mz`=rp?-^{}~s2bZ`RDY^CV9gdr_)L>2DYG#g6;!8I5 zR#ovCN`k-fe)kAtRLQjdCX&S%ep|lg!#^&G&hE|d>t@s;UK4K=4rB7C*en!FaH;we z?X|@YSy}dxZ1|pw*o1wgre~GKcB*ue!b0AE(m4Hyy2;OO@&=p!>?Y0(m`_<0}+yiny^8-c*kdCcuLoWddhzZ9xK+qDTAg*+JR{rV=?hM{tVqV=lwzYoIvquEL+6 zSE0337yWk}Kb=et2?t7_K_6nCr}w_`!O9O%?#Fv4f`w}zhArOS2EX#TC@B%1=)F&x z>J?`zC{!zwU@&h3bJ~X=ch}mY-$v9~bj}WYqsa|v&IE&|$2U*ba7BP`*A5?4%=AG6 zoN0-J))!%l=3k>9UoOMyZ{ZW@s4c2J-~Du8Bn;C(=pwba6@X*Uq6IH1>4Fat;rtYy zaBScW=e$|lc{HE-x9hpN5z2lWab|Eu9$NEBJ$!OU4=QaHRvlQ^M>SIa@w1P;|Kkm6 z{geOsv;BAe!`aSLk83%e+an^wXR+e&OceJcST01a3iWt89}9D*Q0MRcX5v4wKzw5| z`d%AY!fZ+JPl8{@NQAe9IqpDtu=bEY{{4fyu(vhOmE{HvB6QK^x29VIkz4N@5Bt1? 
zw{M4+eLi;_5j5>IXoW5TO7E6uWl}AW{~+ZQGsAm|fk$k73(GU2g+I8fA@A7h?Nj{I)zc%Xoc&oAwr3tIHFUmK~+`#JWg0+?yp4 z=I@=*G&u^$bM|$m=AM{d+qOHTm_}sby*FrA;z!bqT!GSee>O;{YoPy|8A+)ND9B9w zy#;)+eps^O=pkS!D;jlp(Fy)}R~mHoZ4b58V|RnT*as^8{y?Mhj|C(G!9Jh0^aF+9 z>zS>2;uvPJ*m_fGhdX%JcsDy>=ox%lFr>!5dmpqZRNT2k^dM34mF^8>xWiKL_WMtPB3l#QVNAazmKlOR3f=@ za9CU~7>mlWd{IXB0>bjlts}B^aQW%SvRs*F=+|6+<@2>JL^SNx>(sr1@ADTUg+hO9 z7tesyn3gUe85m}Ly;uonl1kK_V;>_q&MV$54k9l}D zfzmd}?6tIejhgG>)xO$6fPIsbMvFBcs1_qVslRR5!S!$9J2CnFSV@rmnpl7jLPNsD zU-G_C&9~G^@Z5Ra4<{Vpift!$CHNFzFMBAJ?|RQ|Y3(L@p1m9_-+UBWk&1?nSCP<8 zQ>_6_@(IXJSRMWvy#4>;V4M8xCU3Cm&u-$Zk#p1OB8LwsMdJ79y{KU{-53{iMAQ+! z{==6O>~;reIt%T%0 zRNh_{pFf9r?*lzRAPlHGmL-BS_YTV+*zJlcMMx4xuRLJA@dDw#rZ*B;9<|#ZX@@b1 z{D=&iz6m6RLq6Lvq=J}tJ4PN$1|ki8vZz^^4NQ(bV@-Z^1y;|J)g1kt5kYR9OB&O~ z2!smb{VEp$VXUn8x4REG|FCPg!}%6U6Xv?}t;r3VS4pm4(({K5{Rd6d?gyhjlk{g* z#Bl8N83~o9?sHhKtL;pygcO9yL&cPPV z_@x?eYk2VVu*gW&kpB-+Q{D-rxUxhn{C7&Wyb=$yz zrh0Uu?f&?ga&~-w3O{N5;RCzK-7M;fDX{WebW4oME=a&P?0l~R&{;Tjm8PZ~_QgN@ z)UGdu+Y@qva?W7s`;S@X*1dtZW+M0uS1)YzGi1c;VmEDgLapmMAi?15K~-*Lj9_oE zZ^4BXdzJkr&;MNrGB)8+ORr>w+gjYWhkX>>xK8jws0O}&yu|w=x|oVx^4`y<3z6mV zXw6*>Yow;U)+@Ss0vmSojPo1&Ot5x{HN#E{a3qD0no(I(NU(~78a&kd%Wp=N3%83t z{;Hjb+dbdv->(S>1QGv+N=2Ze?XV}~WjG{%Fv7AjxRIp~Re*K)1gd*&cK>)N`^Nl) zJ<8iR<{uMo)xpORvJ8PY@N(|I6O&pA^|ift+iQ*?u)Tcjy%9jW{?L~jwH?|R$8hW{ z_lCWHKj7aOe_})vUpJw{qrAcvJ12BCbE+p1g@01W6V2p+Gr>*PD=y;DJMrBkex1FD z>e8rr>4W7TxVzg7&9uFO_MA;l$YO?wJ@cSdb2yk}BOz+*X zo{+9X{b#KbQ=iLV*PkoWO37J(p_k;>%MWtkS&eTmPGmfUjDif)W3^@Yd)-rf;7eRoxuKSSdP!;?>_f(SL-vIlhcV(7JwW3O%N&EJTR$%8h+o|L*GDa2=&$BHQ z2g3qK{LHo%L#=gIbR;VW{cU^wC+*!+)Zw`wYbh*;zSd=?fsESdllhsF!CErfL7VR` zpHP6}#7I56{Hjru^LX~ji(06GWSaaibBx%SRn73|j}au+>K1BT@ka#0f#8AtjM#qh zUUu>PFu3>cu81uM^6`Gc^8V%Bz*abw=+^cWN=1I~rcYWUs<#N7Q@TsQelGqVDfKf# zk8-bEJ?(D^<265hNIs*sG49kh3r;}bO;^ht76Jm%zkbfTCZW*pK22Sbw_#5CkQ*r~ z7!e+|mGva?0o6kS$$@fvF~V4{b^0%6j7RgP)Y(8?XjN#wPsGj}$(1mLZ<}_82W#q? 
zt<3cQPj9dp2ixRlH+h3ie|8gROKt5nchU?ZvU;4w*H1=pT3~g6f5#1wruM2|aNZ5x zPL6SmZO89N?wd)KZ8+EozKaD&;e1q0P04Y}^^Ns}KVwV8!h*bxGg@Y#gp#*n^mig~ zzWpz}bt)C>`bCa+>%ND_F^ARpL&@M|#!vc+9pNCYSKmChE&^ijr2KgaddQ@gtoB94 z2@)25dgnC+068n#@R6G-z&W1pi|Dc+WO6gI_&aS6ceZG0{fV>MxNf1ED4u_T?`H&0 zBIsbeP`|vIf>c_gI!?9)!zQVNV|m%3h+k`1B;kVs_Kv367QcuUq5Ex_xVxqL3l z=*vVooh;7?I$qF&IpdeZrC``Dy;f?o7>X`kI=bZi!UrpzRVPZ~@kN&3x{M24x`6&| zLPn!%02Uve;l)aIL!R;Q(1TeU^r5zx`~<2)ajRbcf$i4Nekt$9?vK{!L&Sf2g9C5> zi-W~==KkYnEk*w04QRUl^JjC<{D-rR>47OPjhfIy-r}K!VsZTb$=u123gEgeM@;$F zTu5?m`Zb_w0P*z@tOgaaz}v}z&NNjJ`$T8k*>wJ2f7MhV_HEdaAS~Jhnc^J?2Xc+k zFmA6o6NSY@yd_iNTii}#@Y5rGjKCYATGVCn7=r$O5OXzyE!U8ujLP@gPBS zV?H^S5W9{2lZzgf0Yih9savP-qSUwU#H^nu;alT}xs`D%xc^BYNaZ_k*bxGq1Hkm! zRXg3-dXzv1jp@TopomytfqA_y!vEd8Ls8e?ebtEW6kgwLg=d)* z1-ILp^X_HM#_w-lf6!MRbKVs;3MKBrPqPOE{7dlf3H&`0m$JvYXBxi${#FIam{l2- z_64zZ#Y&J&RKOHvedPtcozTpw=Tmt*!r@y(XA}EokBxD6vOA^l=T@4yCqob1@a;Aj z-Ik2lX}pJYZxz5Y=6Czj#BbyHR-SaYQwb0le}<_Rs$zt?z0Er~Pk_-f!)k^HwkZD8 zE4p`C!RVQ}xq*9RI6Ruiyyrr_$N%ZNHuVOZaj;E(c9S>Q^k+A5b~m}CfS@>yyjb`r zuir`lr`L9jOeH#?Sl#+G)r$8>>llmowKpE%o3)AY9({Erze1XKJJO2uNt9|?B{!gV zH-!GGD+@a9rPxk?yaJUjb7-jssR3UBQ zGNSdZY2@LHfi{<`ovat#p`n$=?!Z?uAdgY27orjq=#DERXUWtVI;t!DP|wI)maPuCxqU1PC`XXc(ya6G zWN*wv;#8B}QY^TWqSsA#lnj>VS1CbJb{KCRZHCCYD#{QY7jSmCf)tIq#7Y_0ab1^ei3{o3EI>Ee z|Mi<4tVrsry5=pIg60BkLo4r-K;ii7@G(^dwC9T1-@k>~h(lgWu21I}pxY&8yJMdU zdToBamFd7OqUo=J%hFqNiN`Y^JmxwOKs|gi*--Q87PwfDHS~Evk4npHHk(fWhq!Hz z^G?|wc1+8eM#9lym1OHJJ`|{D3$LSab^8N+cspt(+3-EX4B0K@jYIXTQ{vB#uP@eDZ2kq!;DPb2mG zJ1>PtGtl?gAQshxFQh)RI%S)}L6UKfWA8RX8Kv*lw*bL}4vO|k6%{U*9l&V&mZ%1= zZn*KK;(gVv@m*tBivw4b*l6YYlmCYwE|96Ky zOO90pzL<65l)X~%j^=XIXxMjrYh*q=o5QBkhc1Kr$mlf=D~=+$4NRu&*7^(*L)upRHMBErk=rP^{^=U3%BPr^uY*Fx+Lfu4|PgK zILT_YP@8Fe?T+X(q@_0_ovf(~V6g&Ap7?pJ_x0!q)3Oc_kzQpx)9@VbH(X`k(iM!P zBw*m{#yYgUZ+dSoF9qHe*tv4DJd^ZttmanNQ6g0CwVzIO@`Kt2j2C1Eg5b_5&4ptE zNuk$;aNsf zzw+Q5(K^b;dxvE#iVWr!q9z$ooZT6vD>Tj!g?=oYXYg(!sXknOS0wd{I&$akz}0)X zBp;53x5|wlC^12g1(Jh46B8%372r)^7-5sav2-K^-9vMD3in`qXtTnw1 
z2BRaLH6qcDV0fjzP2HgtiM#D-5%ll@?{sv&JoGmLoj(2_&2%b|?6#{kERh$0vZVNG zt5Faf(eaB`Vh)Et#O-A!=E>MoiH~E+QW0R!Eun=dhCJ-0!Q!F1K6|+8Ek;-I*cVDS zZ6E2+w8M0j&gfjdNyg88bM3DngkxvpSdAa$LFm|HdiGGsMR;1-&wyIZ1(d$u^-3N% z4+@6N9y85c!5nVhHRFu)f)S}&Cr;fnL?e&oPaWEN19Akj9jM?*z=Z2ddZNjEm~NoF zjI2p0R$Ni@@j4|QJ3r2Nt7}vZ9u?SlSR)rh-7@ry2>kNEpyx}UdJFKFOA zlePnyVBwEB@{9Z$3iiTM4-4*9SvEnB?0%?Sr|LtCBUSh7<`m)7`Vjq@ICIo5`F1PM zCm&45^&mr%pa*tfaEhscIRrD6Y37i+fHi*h`7xP&Dp6z(u)u zRoquoUXF5csGm17flO2ndYv#U!3z97r%5{s;Npl^8AWOSZh%tTa$=|;i#R-gSOmUd zLqGrSd@&jSgF>tFiZgF&5Ab|ucWU*t0}AQHE8pBO#`;wMK^Hsd zuAQ?YV-mZieQI@80MGm4N{(OIXlkaf)zv5v%YMOHJMU*st=z%HTG+h>8s5)K8gF<- ztO!$l7*|(7WEhhXMJh>@+tu&#lN`%Q=im0l4yW2ewADMrjHn1ayH%Qu6?bBtc{e*| zPS9Xt;Nzd9pT9|Kv=f~pE(LICEcN(V`WWDA+AZ&ytd5Dw_|9!zlf&-0^25D&y%~WX z<$|n--lR67%=+pxDP&3id2eCiIwe%lns?WaUJ9SqIxo08LZtitY*LjmhnOENtK9r| zob*GHhs7a&jd}px_1>qOiR&*X`$BneR$sNj^GTHOsn$kF&?HuN-@TU~mr;}%Zr zP>;OE6eFV0wqMxO7adRWQ!U7~e02)T+iIB0aNYyFSD5Fq-yI6lV)b>8nFRtar@s<&2RZ%jNwGaqaQzJBw>P7n0kTRu*}w$SNO@?J&(J9>>N^9%K8YD(tE zI7zhytSnQa4LG*~isa7;5_c3+wIWpQsl_gX{YIt|6emet+U)iz z;$C&%JoOep# z`L-O=gHY!4vJdr$C!y8){+SDu-M$L)r*CKycZC+vy~NKwX+A2o?UcU;eDE<49lL%Q zWVgEJ&P5!-4#cvoec!tiOyL@@am``sog}jr`R%_*PjqKEI~SL!9;+<@0i#bTHOz_` zc@9m)t}jA6s!27Ja(V&gfGf$w%{+xo{p_Z$Yg2Eq83)_sXE%9+O@DS1XJ5NI)cIIW zp~L_4XJbrjpKgB;29)Tp#8jq_pqhLC`Lk9*qHhMKy}`)fNMD6;3nKi_pS^UIpPa96 z2d*tS|0D;Lql3rQZMtp-fP5>eY)@J-c+wQoT~HAU!(uD<@oJOdY6k7P#YF>XeDV>) zqb>e`_2@6=KL+;zNzUAk^^Fy5ms^oDb8~>=^*p89#*JW0M|*zbZ*$4udS>7d- zxaw10X7^i}C!91Iak|M9cP&t}gCZ-5|I>dOs(HiOc8Nw#l>IK*2P zc}7eh4zC>3y}NIkv}5FB#=$cXOy70n^J>k27-Iyzcn=Nqcdk0-A$b8n0@nPrvkcmu z;(OxrWgAkM%l>f?n-Geo73>CW1SloNC4GYKLS>$l29PSvNlt zPbgK-cP;J#KJD|5*Kb=x_Fw!CITl4!D@s};yhkF8gX9WP9y%K2qNFlvh8e`V%# zp_MrHIxh5CbBP03__9N>ox1||D5;_7y$2yOdbmDWt z42hfG6BNeCZZ9R)HH!E07VJinIyUsURKE4zb+j=(B1eE-s?}JM&*6~F!{jboSl$6WZ z-GR?LzWsIrujO=!rYY6<`78eY%9ko(nzv1_z@F!D@>c4c`e_{?q|XUSMy}ZW%KQs` zArqkLNqR+N5(afSvPVSq>tOx6R<7%JbI_LM1>4)U&(Kqy=Ml$jpHSHaYJ9|BY9Q~n 
zTlaV_L?aSGC#a(>7HMaMj;F9Zpmtvj&L7s4Bo+N-1_0nvK_8UyqAHSwEs^8@Nt^rXH&w*NG`mx2VE!*x!*zqW+1Mkhi7$G<;{x zgr5s<&33`k-h7bsKz`7;Wz5 zqUZ+q8=?pG60bkdKPQ5SW6mjdZ4|RA>6=GJUJ{{MgvS2qeb8)@p4D^q1X5gAP;znN zfhDM#y4Zu){rX=%*k+!>rhax)*R`oP*o=d1^0S+~!KOdEiL<}1UN@oBo=29TIy~x!IS)^2s z3Zp=H_k_XKZHj?#S{_X!3kAtDlCk2;qeinT@H2aS?Ak?}0X+2}b zZp1n0Z{RHV1CI)|uZbIc*4VR~LRgSdFJqT-G}X@cf~QkK@kamR>4r04m{)e~&#XRV z^0Rf&eff0bIhq)ZieoCWhtmF-yF&t(e~N`V9Y!$cZEw2I3T#!B@)L94Q9ZJDA2=6p z2RZtW52Br&czmA_5qf!zh|`ShA~iffQ#Rw|>W2=h%wdsh6OOkaO`*X@|JHx}t)>dk z{taGqKOB+5n25RVzSq>zUmtMT;8itT{Mk1672AmC<^C%2*^S%%;_LLI!TGh;u!8qO z*m+Tvxa||2q~VukIs}>^G-{Yw(8L=};yJs0lcj{XDT8M z9GN7JQu?`(*-+3V>C023qwZeX6L1J#u>Y{iUB3n5~H{gFBJ$kaPBlil3ysVUbWB zhZl&o=;Pw{>1O19XD(~St_}rU+RN$j;{gf_QQ&3ndkNj?{rP=;%|Xh`A^O?=G9+mB zs3bbJ1=iVb534jV#4gpC`ltM+qAru_%A&T1`2C62MTHn(RRT557dy+*NnyR!>r=`= zOD??lVy8Y3_nOTzjn9O($4@j9c2z^do1TOHE|rMTBc=OA%myQHCP_WO{h^tq1J^@3 z;*i?hR<8Y=RG4<`Zv2ta2Z$#9#K?O)YwSjJ<^tV$KWuxO0HczX1(@yJzGUZH025bM zQ+JD{K|w!}EaSPShcb)y#UKAD4LbddeJ;V853n!^VMSm1WR;s*Qsy7w?4p(8S~0w?lo7+);#tC2wvi+n*G8 zxZk$VW`#(e5OqEa7(o@^wO9R_=7^T%+ez&o*09QA&x`VlYDj%j&8~uPH{8s--OLBu z%v0FZ&u;3vHuVOZaj;E(c9S>Q^k+A5)@@l~^|#anDm^O2K2sl$zjwj*0yY7_;?qse zF-{zJEm&?DEym#eay+4vl32=M30<<$2JQlx_k{s~`eJD5x{oXTeSw#+bI~7OrJOrf z-DU!6dij>lO89|yO}R`Yr$~&4iW0geYQ{FaR9PhV~?CAejhV5evD#ROyxnXs}1&0 z{JO}R>FjYt6Z!3G|Bo2x;9Pz|;&}oF8WR1c9jrk0lhN$~auJyP-7u2{=48xg*~F$! z*%Xy(Ck&Cp%wfO3;qa?YN0``o-}6yp0!FxmQ_vzNnCsNJ(b=C#*x1W*%P07`8gqBM zCK|4|BbKa@gvu{hV4UprUp&5s@E|=svu$D^^6;2064Z{wh6~yJUS0CT-iXaFbAOA% zUaSVqWwu&_TC2~}tULkuc{Q&G51y%^*w{@apMfRByMLXoZf|bWB>66c)yngb(MnS$6LBA9!#Zey;*5r68>D) zc5{i^VtBu(5z82{-*-3tNaY6Z-jJd=g>3YXt;Y>ea=Jn%&WCN_ZU^73zus7wE7#d9 z!_QR0CwG~a&vuZoyyf8IBU3~|8Hr9hf*BAF|K>Wig#jb5_J8Bq`JO7gMQi8M{acVg z_>5Vqn&bCbcecmtG~Sku%HrS0i7b8><6rJVdL|i7OcOXR5KcL*>qO)KCx2D_dV1qN zO8F#~h|VYKc;B42CqvN@}Pm{(MsI_dYm3)71;<(^H!TCZ9w-?Q>de4Qo z4iCDO8 zl4&0pfQMi0CF>b|J!8@ZBszbt+*2JAV7%arXByZt?!u z!It2Gn8^m*@0cq*#v2YU!bz+m{y79Ggu}9@a?m3)FV(HsIj|2u(ohtcVihW1g_LU! 
zfy+k&1Vsnip?Yl<(Q)b-B8)BivZrWavOV2IHcwf&a89(p2lqUQ3O?x#guzN_0eBT=+U3l8H$J-9uBgYV;(q4*x zPenhN{mF{2)5L6s+X!z(HR*zN!1Ft!#m>^D#8efZulM>HP}A07Tle3M6oN%!qe13Y zkaO>;ptOh9%ZS_DYMZ^ z03IH@Q8)6oin8BV=A@$dBg)|u1!GS&F=B9VXt~-42UOcosJQQkHzdfN=9x7DfN=J* z#WiDy&p&)+PV+E$hsmEY+AWRTQ`Zl^G0}l5LQ#xC-?h-0M-tt!PgjU?*`t+;tOPJq znPMdHm>DC)|7wpC86a-fH`vU(-OLBu%v0FZ&u;3vHuVOZaj;E(c9S>Q^k+A5mUy^E z@>1hC+S&7;Ka0zKJf|(9Kp^234|Zk%(f`k%#rOA5_m6A9L!c@l?dK~r{Vn~Gq-Ov) zS9(L$!u&31I{Y?|dAkQ3xEuUvyw(k#RV(RQhkEdcqvVs!D`YTUFZ<|sLk@UW9mKZ7 zDGZMvpS4{m)J2LX9Xve;dhqM zC-(MG@QMl~Tk5?{%5}vE{?Xzq!E{|w=;8se?3*N3|P4$_A-}(lQk-EebRNKz5%MN`j>Z$Tx9;`gCS1ae|ZXs z81YX(i?SB~>AH~c%#re ze^@ynYuanbb4?2Fj(*bP=DZu77nq6NHNQmlv^!R~qrC_w%c?UfP|6?&`O`P<)cY{h zV>bJ_LKGb7+OzgMUuskJ&mC#p;o1cH4(IyuNpRkV|JCMH3spe7GQ~b{b8OqG&SibD60~pE>{WL#SUhjV8A zOc5=eg9ww%E0j=SO~bZ0ZM2U-qZK$}4391zIYt{NhmV7jLi7TFAdrF))(&It)~C09 z8B0cd2lAq}hh{=5A z&l~*b&u-$Z?S;QHd2RjpdhnX?1y0!BGc7{Q$$s#YRZvBAc`I7^`xpK2^T({~R(K=R zJ>fQy1?LkKAFy=ie1Cn~W#FVJ6f!Qe3!P?*DmDVHaR0^i?Q}x;=Z{zE^2Y)E^R!E* z%IMcs;9*?WC1B(OB`coH>E(q$clN?{wr7zW*Zq6xb{ziR5L9;=dtle?UYTW0L_@O_ zkqXYA_7ElKWje&%MsaWYH9kL+1pUI#29oAOz+ILA27x&lQ1p4Cx35SKy(+$8a+%g2 zK1pZ}-j`<#bx*wfEqzZOpC5CY-S39apXr_UGz#Fd67b5?5i>KGqrKl1gd%I){nq)c z;maRcj71A!FwA8~vv743dKu9jd00*vdp6a2pL0tX=vZ`|D!+j1+w>oGZ;#Z+cE5TS zPq%U&F>pMb7FvlwK3U7RnT_nApVDi&>2VJz;mC0-@3AlP|IzgxuOmaUj3yJ+*&&0~{v~SU!bVNWKlz z20yzve14NRAg5ZZVs~7>)NbJHQ&QT=E>rw>CjR$a*cxM!&XK;_Z3(qm+#8qVDF6C* zb+}^LBTTYNyrB^HTijTAcj5bySm9kn1m9-4YzNC4NInW&R;mG!P(G$ZoG0`bj_Wvr zo*dfntI0W0T7dCIl9j|^BGTDKQZ1$2hdLQGwl%G1@%eBisDjVNb0ojMd~o$;Ybm5KPZx+jU7>b;GV_d#Vad6c|XHO-62NNBnB2Pw4f!C*KzDNzt(z0)y_c`NB{ zPnXH+^2F=`#*!LvM+oyeZ(%kZEC+w=<2jcfd>WPxS1dL4Dj^5^1f9Y!@+gLxS!`#6 zI!tuFWK-MLjfxGEUE_B@MYVmBcLlfHLnmF`y4+fu5IJstOp0AIWS%Y9O_2%1cCPz6 z;BHro9wqlsmHAms+?vV*OG!;2BzQucrCZwK$~Uk z*82Sn$9z&aMJ?`{fl$3uo#F}#P|2y&EkO7pl+rkxP?g4bu5oD&%kf|whg|Wr2l#fG zW7{p43Z&b3Am0Dl71*gXzEA&DAtbr~G8}gj!Zgy>PKc!^VWcx)TezDW*1uz#8?D5_ zGyU(ZgQfE!UH{B(1N~E|aa3M_cYg|!HNWD!>GE+n)^nGi9Dw@)!5>mSHRbTB@1C!Y 
zJ!j#=>+1p(fggPTyP?wk$9G!#)`|;@Ayia%qthz<;R(iZ4dio(g2h8)>VYi9NZ)dy z<;kcA<9%wt5p7W3WF!4xO+Dg#6UxTb!HtcbQBiUz7X`m%rrfQcM?(XLWV^Fb;b>)B zzs%k8!;I}nZ$t6tLK16uX^;t)a7ETae``6^mz{fM?{Wqi`0`)&DYwI)6KY@nsF%Tc zZyVXv$YOwG%DUga0Ro$sg z(X@*d`U77R>VJ_)WMP&#aNO9 z1EsNojU%gk1N^nwu{jpTdepDCl`yp$0yiLuIB#dn7fB@LOWs(y*O#HSO8?c}j+!Of z3jVk7C=eTYX|%u`+0Hn>9fgi5;E~zYX=V`-$$m9{C*aGX1x|4N8^RVB|8oL za|SybMC$Q}a8&(vhDeUpNl zDi+GaVk|FaV<9C3>lT#;(zb?L zAEhys0QjUdZmBQdXB-E$+shfe*B5ZO`0*;Vwh4NH8&RUnpd3m)_kQqIQdXs~#0h)S z{F5try9O5H{4dDwUpM^A;}f-|vJv0DWBSQ)Z+u*?i%$1&p6y9_r?R*+0QJ%k7@d6q%J5D@G3Wm0r03?nhU5^J;? z!SMvD@vzn&#_@LS`)iN!;y|Ncdo=dV&|~X+t-s_a>${%ItanGrDxd28 zqwoObOC9C!MsS6+Tqwb|8B@z&a25K=4xZkc4-O=BF(+YKg|S4F1wi3 zy%jJL^|ETD89yL=5edI8!bgViHz0JkgdU90DG>Zwg4adx4TwG%fzJ}S0WqJ&_jk3< z`p9!1+rMoXY5i|Li`UteL@XFInH^no+rAH}zSREbJRAWQuUs3JEzAK|bmEsu8yw)8 zPxn5ZjCX(%a&a{gYFdz_v#q0R#Sm!R)Y$V_DHFW3uuz}j$V8X9dpimmyitF8#osHo zevI?htbf6qao%RuCg)--Y=*%bm#-lM`clH0Q&hB@E38dVCK9@r%*AzBT42>;W779g zEcg?DL(ZHW3C8s^o_P))gM267^eB}oph|5I^YVTz7|b)DyrQj%kKd7KIADX1PnDr% zy5P&j(KGG2;75@QOR#@F3Vztv+^FpWZDq{YYitikf8S5J_U`kB>!hROCZ4J(3hFbr_4fswtug+%~@4tE!go!+niM3|$ zftNEj>u-|BfA8==*;gT$Ya`v50jQt9Ja?qEtzKg<`9b=jQw#JPy&a`E71eWY( z^}JC&6=kh#R+?+u1y4}Ye;i}AfPs!kP$Wzqsf%uMGuy0%)_h+)BRPmGTF&vjcBpKI z4k6;9;f~c%;-|TTkf8&VQ4tV{{!@dFX*n`|_#^?i`HBl4AFf0_t=GP?nc(=U2Iu+* ztW>ah-JeWem0Rd;>C`1BovZlyAc^poB>`KT)DMEa_mGWT1^GdpFDB9WB>tdQD2DBq zkE~t1hypj+M77|=$zoa}l$Sk+A6K8ex7#WFIH-z!&oZ#%_Vt$GYhz+7RTpIfZlz(fV|6 z4M=f;h7*!uHO*%rI=Nvb(60(VuD5GFNZ+Xy8Y8d^q_SP6*jQ>}j{Z;1r42?SXXe9Lj$#gc@S{}Igk^OeM&a=@#IIiz zs~t@T(6^tQO=&)dQm@f)Lv)GlRKC2^IR}7?2A8FAE}HeXE$Ti({bl6RqSiq z;y0DO5oq)L_6=w>2gSK)MzERX!@1?_;((Nh2>)5a7m@JmB79_q|L1Q&=xzx;7@<=j z__GAB>;Lf${_BGg`0W1=H&~s|5_MMNNT$oKjvjP)d}u`}DHXFywq(h95(5PbB~l*Z z{UG6Yp+*jNSVKF*;7@f1Pw6C6P1bfV8~pqP|5#t74A?r$ zCoG^3-12#Pxcx!^yjZzaq5Etk29PaA)MEGEOSTAD46S2pP z{j|N3jVgBt?rpU-1FjnkHs)RN0Zw*C^>Q0cG51^IYNk6i(fR{I4g=2aaE1NSb=I%C 
zQ0~F|`Hw#P@$ogp>IWF}WD*;@+46xC)?JSoolNVilj|D>3=s4)i`bnE9E!F%FB5ToZY>E$R&5C<9F-<)nF4+v6cZOX`@#8%lfFZAZ}aO zmSe~$=WJH~drUf2}`=G|Q*)-gLmQq49XZRvgBr`Y-albCqaD7b% z?jwV$n*Ez_e*;);;>pn6!a3R{h8_$WOggOU6!86}209(U)|GI!F?d~6y66A+26&|! z*2e4#p7Jp8*_W?_S8)Tze74u^C>E`uyIN5{kD?!7nctVHj>hrfo+^89de}2Ap za?})Es0%tj6Iu+0zm+ z?{YWmKbD`dW(FUW56xOU+m3d>ZH=ipYL5i9?;f2o6oyBWPuJRK za^lw!zmn#MR>#l&wOT((8jZ5x!1X%@9%-5=)A--Wsf>J-_F%!7>Td`Q#UvU^I$c(` zv;HR!7~4lG=rfkonP>k!4?JMogHbD=yGRcMSTwo2=i3O5zo*-AdY{)t@fsTSf&w}y z8!A28({BQ|EPm(PYDhzyjn8H@7j&YlhTFQA>pIZRFEO7pt8pDHSo`;;!)bWVcO;cp z*8^j5mChMw$wPm5KFzA$&%}?rJXek_0dp|gyW?H_MRa*eo}KAK8TyW2FVf8ntgt0i zxGA&}g{dE7Usn=_!4#!CyT&X)bCeF|k?)RH4jgpK*QLVHjP0(s&aUeI*mvGuohNG~ zn7Cn5DV(p!n+wq7`U54sSJGgM`fDAbx?)Hwn~USrFvUpnDxT-I#$%*L(LwrFFKogh zX2Z8mP53)n##&`D269OoJrQEHLOqFN4R$M$hy~+_b;~~wv-ac$%5SKFmxs!8l?!WO zYr%_jX48vs64#fd4hO*YqjZ+>84hsmW}=Sr&>3`X;$qoJ%LvAGUOyUo5NIfKO;)s( zqjA|6x;gGCXafJo(I`VOWHP3Ba7P^yaTQ;pbaG;z^=G-S7@C7){$BhnIPS=LVxQGn z&LAWzA#GUWAjG(iB8NZd;MY$hca4WR_H1QPn(In2?ENTxEO=`qVmhZho&O#0yOPT5 z#=qmwMNx%<-kl6xA&2XF&3#~g{*mmDD?aGxO_9)hCqm$+oo}z-qlm&cw@Y*6>g^HX z^G*2A626FpUl-vcgWo=@{sx5Zme7L{It7A1i?6rJ>-v9uga7(q1U~!!!wpvFvqYUG zt@X=LH@%NI*^5b0m7!Q@u)XHit@w3PEwk$9H-#^x+4X)2DS@OnwMV&LS_7tlvlo2r z+%U0yUuPy~%}{hAcKZRx0Z3}B^&54=`??yxxhKbU0cmpG=VLQ|*mIHbs0$xMk(l|r z4@J9D80YovX1)`Q??rj`DZI}`BI#&l7J%HQkB-UDlwh%v5}%p00949&@1Az@7^Oo^otYbE((msyrd)>wOC{&9CUc`>^q1JZ=+gQ6pl7qjZK;R(S% zXOg!bF!>R)#LG;@J}?e94$= zaows;0pCx-t0UU0>peeUh3%D0`XAqbMA{~&2^e)&xXKSZmiK?S0jZ}f7JOg#ZDq{# zI5t!!5h>+=8@X&`CRq;WfGsPFyL3jRQ0!TrS)m`%5T7rVDZgOUq11rkhno23gy_~_ zHkeUmwdKi3WhfUT_+ljBEWRFzWE_FxWB3j&3R%^QT6m;3uHr#uC zbv=(w|KSO?RsO5%v8$g3GOt+M?r80VC+DerMIPKR-s6X7HHRa9J*ZYl7@v>e-0XuD%w?@dm}Z{9FOb?sT_1BhwLo2}JZ4##IpGy8aa8T;vye|z=3QfL1# z`rM=r`EN>$aWvGfTQFX0U`r4yHSYvH4$Amzjqdcd(U{8b6_k5A2UDF-TRst1jG3HX z-t}{OH)d=TI#L^P5izg-Fc$QOQQuGYe0|~!d`5EQ*#0%+``eL4N^}5ocwc1wz)7$> za7y{M%^eiUgfrw`yI@7MftUcJLfE*uLErr~6*IaYiFe!UVB<|2%|L>TG>%c8N_B)=yMP2XXk6dp|BT{R|$?7kTCy=^E 
zJB_d&RuXlub1-(mx0=*-BMrMgZ_|_UZia^4jPJXulZ1}c?TJ?tm1mqUA6>0on7oVD zT8qcU_S=^q^3*PU0lcu@#EPBg9;&@(G!nkgvGUZkLt>?uyRmSH^ zDZO?CEu(98vK@{>?T5bzvc9Ck=zrP2Ed2ct=}cm3wxB6~Jv1`I2AKh=hu*2>hT{uV zgGr`MhyE{5f$;ez{AUSYM8dC&@R1?>4G7&Wp$8*$3Iu-^hig}PT?F5N=z|gXEP)#k z^I4+Kj+J${F(2+jYx@>A1U<3>`y(QCe5heS?~T}J{Xh+vm8G+N?SLlqw2)uQo|M7Q zhenLzPyBm3+hb>`9Sr^&wa2Y9PC#M75>^U!N6=GSm5ZAH>{id)jj#a5`3lxEEygDM zzG!7Wx`y6~U0rC5i9l}~tfV4bgW$5?*rAmm3E-2x)mK5lA3UiR`Wn5-6{NnjJg*b! zgyXAB^NQ{pkb`AQKi3}t*spQ#;Ze>#jPZLpR~X-8jf!*681+@-gWIKUi>~ zSvu@jIAcADx={RkPNE_i-8g_>k3+eIo|vxo9`B+LR`5~6^R-uN&5&@5;^&udY*1TY z^0@vV+tvNh_;M6qA4k|H!dI91bXSmjZ+n++0{@T^0Xfkz^acD(3e%yU&Aq`<5Bv3P=YJU@wN)p(>q;wAL!Rt`! zUHp3_=fI8e-9e4YaYe~Tb+O`}&>_pNBUtK0)M)*l(`Zt0c&y1S3u<>AQ|x+w96v7V z(?6|=jN_RY6Hmt+zVc4H1XdxT!99{P`>(;1n(Llx3*gr|qNsRZkSmTOWQqEojKTW` zau0+>Jix_cd;;XpQ81_;i+3r{g&HacUSI#0f-Dy@ejfT8iY5b37e>2XMYq*6x+#2g zbn|$)&0&i!^eg>tN|V_g6yxBr+p?t;JzG;Z++ZgS?iaARrqmZffgj=yueURPS4150 z7TmBKB%>hHvU*rIuft@Qc?Z`?uB-$UL}QI~2i~NIt?*fy*pBQ01FSZSt|PhC52Gr0 zJegP>=zGr*XO|2 zEbA}DGN%xcTSMe25I*08|19B)NceTZ|M|!e{sx5Zme7L{It7A1OYph~z5&q(Bk)-Q zHz4M-M4eUH%GP_8=`(tHQN?EMYd0jb?Rf!rJTEvU<@T@!@BL4gxg7gsr4L;0`|S}a zd5r}3467W^IE+p~smX~_Q=q>5b>kL@`^_Ymu9Y?o#MXAJ2={d&r0sjNC*DyE=?XPI zuA4Q-s@0#Wevrm>ZDGmjqDPNlkq--pQVxc|U21MaS+YrR*ENo5?*j&C;F9@KbL(8p z=$rKEb&|(Hkl{gBPQ7^0{4z4Gz55LIcZIpP%t{p|V8<2N<~(7RS!6vehYM*4?$~1| z6oS#xH1*aGcz~=^tOM%zaGW%&b$|JxFwFP!?e|7MT%cUQn~dJSyr|~DaMbw2NZ1hf z@DR`V5BjK}nyfIF7WVJsLvNM*2yFbs;~Vd!EI~qQWq#)EXl#0aQA*Jf7f{D+LV8-n zuCA%6Wy%!2q5Ttexn<=+a(j7T`ox3~A8k=8>*1E@33B>Ru4DY&>cD9BPWYKUra<`I zKkgTj=g4o)?!|E3Ek$v^d)Chg8m;fs2Guw5B7phjMID?!OBu@Fc*5hDBKC4rs!plc z0&HPkp2GWJ@DX`qVUxZOrfWnp2YRMJw^flbpM`^5(;ntBk=XE>`>9Mar|FGDep?NV`2w3dOfsOZ^Vw~btk%)ryOu)XbBh=Vq(q@sONxMtfDIPFqkOfO#R{Z!H=NF` zUWW}<_&5f8ugBiqpu8H5jHX=eJ}5KUpF^)seY9~<03v=(Bem0(CD3Fpwnxr<62~6@QueuM7zox3f7Q7# z6AX?dV0j{V-U*wsnIUJ{K9CXqM*TY}3pJ;fe0Z!XkLhUC@~Y?sV)T{6EH5u>KF)hFI$EhXczPuBPbyw#p&kow$375;HmCeWa?Fm)|slOMWo%v<~FPQuP 
z9l5{;nyQiKM#00BDZz}~Briw$boZsQ&77jNugnoY-`?CwHV_b01Crddeah!D*}6pF z%qMnZb_F%G<^}7hfyz7DZjLDLE#0%^4e@CPvQf{;@p1=)*X?Pdr08S>OU3rn@AVE! z6xt6{K8D}k7dVkl&I_vAS5s0TdqGBf;fR(fBLD z+d+W;e|0uvd}D9j0v$#Es9;kM)4_f^v?*-e?gno0o%854F@=*>mCxP6 zBVnS{@Xh8cRG6?+^3Q{mGuSE9XW?E4kAuVCVuO!bS784(sh{){jm0v@59j9x?}y69 zIhpAk7BJIbzL4#rJb)K))8(<33N4t9xFp04<6OhLM)A$xJFLhT6 zu0Q+KTxrA^-q&tk{#>hvmPYn14%;WdqvVyBrNWWex4NG}{SL~YTw=zzWk3U?fZtyN z|9N1d=hfC{%vfT_?iR*y4SKf!RQa{H<8Vm3x$kMK+<&d@W;4m!&2ui~Lm@pUp5ngY zfya|+wcJi%XkW;)k0=1xnS?kiH)oS&SJpB74bb_8=S#Oe=V^sY3J)?qa)OzCt2za= zcl|MN%hVaWBI@^Uhlw_rh+pL!ppi=2vBld7*n-T@KHhy=V0tev12;eq?_^0gv_)e~ znsXZ%byo7s$xSkvFDbd7xBR+ed7V7d%?%~DMv*r(k;}%WeP{Ak$pSCr^bKZBdB%XD_-GqK;MN>I25(>Z$EQpS?fjqLtT1f4jmOy*8qsqO15c3sUL&0|MJYcH&G6Dwieo33^Py=B}LPKqB=2(kT zW>)#;99Wg85a>P^3=Tb#;JMLy9An?47k8OG8Qw~|=R0xg1aQ!8GOvp%KryKy8MZ3S za7wgVQI|pnS65W;Q`8->20msbHp^eMrzd{6+lO98;?iTT;d}Cs%T2AYye$#nLT%;e z;fX|S%KJ(A-Ud(bBw|5sRId>pN>ljF{U#NDW!ts<(f0z9Rx{OAo(;wp_X&o3l%2o^ zPB;s`s)_=f!$GO_v_pW0{md1PTqbg-P`d}uZ8mUthoDQ>k`bss+wx(Do(H&dF=vzM zIu^3jJ@v_LyLQqKS{#j!3ge`C#Ih!KD~*$%ZT@jv;Q|Ztkmnmu{5DGq2+nGjcx(o~ zDXw&mOu3;u>ESQen_6PlU)fbiWM_K4M&Cx>^8u8&ewLWEYWvAauC(q?!TL0I-f$0r zr_$si{!i|XEvl%C$?wO%d0n_|^oL{P3n_4O?s7i&d3mgbSx(rpm2N-U}nNWk2A)<=r?jGzZO-3ePI zO;C*@jq0J}iFRBUL8*r{FvEFAuPiwwO!^<`>W(Zrx*ub3o6AKWk%hn6%zV>8dq-qC z!=r;Rl$y*Lz99_L2)AbbK7AQHvwU=TrZWzv(JplDI~4(y0w`Q_-OOkuFV%2-gDqSe z!7?Q*ZUki4@N|ae7-By{TDD()ngoP?73w>g<9?Vicl7S^ABH*XGZ`6T>(Gvpj)syh zU6}Hqw#Y!q3p5$eI^GV~##9zf-n!1Xg35i=S3&&|K+4Lmp4B7)zRe%`)p=SUQF|I3 zc2?QIK{?ZD4_pW87q=!{;*uIPuJ2pTVQgGpvziYkwESOgO`7w6c?zdO{`2{kTkcu) zpFJq@Zq*lYU;255Ul&E>O00a7(<5?8eKEt|0M2R)U&M8{6e%IiSL{MkAmgq9L#F^a ze`PcHv-D4wA}h?6HGt@1E`x6X6=I}z;e9arh|Hbk*$ydS-fqmm4bZVoEk57M-D!z; z`duHtzf9Y-naz3N2?uiINZ(*typOUs`$!9iG?}8y?;xJuI7VYV64J*KFNG-zJnsH< zHWit@DllrRen($gq^23s+bFkd54PU5X2mqMkH=>J)WwjKZ;rTaFz5?OPuT$;zl1R5%rA3XA=oI>9+87cNI zn|}1Z@#&YxccbMkg5a)iIc3q<3YfU?WTc*}ge9aU#Y1>*i<{Qr z{F{o8>DOM7t;23MQUss%bT_|^B6ssCZ`}O3o!rgLTP)E0o%XqwIvAR+jvbAhuev6c 
zf>tVG+^mPb(qr??@>3^=DA>=!pQ!@jl>IGR{@t6NzMan5+cY|r1mhbLufVEo*nGC~ zy>)N`Bzc7&L!n`CvrPZunx#A>*6==rYh)kzl#(VZ;oFIe! zn;-QgovVOc%%$|$0)KSD#JHY!9R&!#kq@Zbk^|#3e>KUMYGWTZZth&?y9VRbxu?~C zNeel;O0%a|*Ps>tOjip^E)>PCX4?x{V%|a;?+2>QuJio^FtFNgYoyp-|Y9lz~?rHiM!%3%|1DHoFveQ_1Z_7$mfoUA~nu$RGZ`Atx} zIrD?hPyuv4Wq3^1F@)~yF5MEYp@NQYyyYg%?t*qK9xA@#qz^rrUS?eFwWCKq{Q6{Y z>6^N;Bk6tVbuWrj*uy@r&5~%TXTXe16>>K9(jmszZ@lHf{=4}{!pFYKe7x| zeS7nK01RAft{ojEPhM8rDiCd{N*k%)dp(P(gno9r$jsNS3zYLOLcHI_CeZCG^ZD;* z|0av1iKz?zSf=?8Wyc?^v_^ZLS*GaOxeFY^bpI7t%v&9PLmh*n)P>cRnS~5 z)|u>@x{w%@kBKQVGAB@0f3EY5~&k}Vu<6WLx$@C|r z+Srx9p3?$o==b#1W*j=v;2!sEFjgopU)RZUuJz zV`@9%>jX~xug-3u)Z|^RrX%l)&isOUJ~;09Y;rUq132uz`Y-Ko1<>kgD5>>u1+tn6 zl*+U2h;xfe+5%TH+`OS6xBbjf=*|3$?sv!%77a5!P>S;fCyv!^S{IxIl)aVR_!{>E z_cA?>r+bBvk^V8(S9{%H7Db6Q1~!nr?)9UR3v$qD z-o}cNcPo5JpDu-d68adgz3{B`S2^5c3v@@eaJn;CUB zp6x-Qw9HG23iS!6;^&#$OE2g4Do3+`jHg~3y4W^gxgir**Q>69cVzlG_I>b#lr1b* zGz_j#ewUv=c&6n5nDS!{cW$!4#H88$TZQG|4WE@k({1PAmi#zS(Di~6_ka)BybC0& zUX*#Y?b0*a6|1$fC*KZ`8^yR-hreB=TkiknCNn0546Zj8T95nECqk|VsdF8nRF!!g zG~UAsO2@bl&&p_FC%*30bGvjH%x&0gyY-S1iVpEycOj+(si(UgpEUeL=TFj7E)su9 ziM@5iS$%yj?TdmprA_z^Z7b7?EjY;s0u}R8p^p$2_a~mLp|nIPGq3zohP#74%ei3P zxvz^dKjQn&)S{K{)gZ*Xf1f*DykW}0Ff9x2n#?)V`Ktg{>s-qNwPEn$x#XkTbQO4K z{f?wHl*`B=9eblObO6}JeCZfJY!3gl3k2EnTtNd)c6C469YK-b-%RO|0wmsL(4~HdvmtA>TX5UY9!v4Z)XKJyeIi}1kciL#6h1wC!&xukHxnAw&JwU zv<z9W|-pAJ1yBU56tF&^I7lm|LUx8+Kl$uR|r)a zwy?)EH(qFpeAIyd-0l?Ds(QnFZBxBL zuf?H5>&KL6Whxj8OY?gFFBC}M@bG$%>mgsoCq*8c_W&zb&erxU@I$XH{!@qED+lht5)f`5p^A}h1!R7|rS95C~ z+i!Nc6xDDYxGL)F3=X1_ii(y)C zZ!mPX$mnLvUR)1Gseh5YrzhtiCVccS9p}#?wjJBza9$UsWE*EAPt`$8R=mIj?}MR! 
z^;?TVuim$d1c>x(_~=Vcc$&I8pS7Oz6kycZe)Erx+&I2#C{fer$NGdUAYW8vIL-^o zjyL^DxV#3oU%TkVy#E0`bLlAQL5URF#9_yu{E8R!9*MJTF0=)qlt-zHijL50U}|9G z@+Jh`EcP?EN1!7z&Vyb_N#yZ-mAOr_HZ(L<_}l&(TGemHNBu!YIbi zHrQKnOi?bK5`NdFD@i$M9IJCqi35ZQP2N0c_lc%!yv5!;@+VtSdc2>GCeSe27iaN%mG zmMAEEb!q`eeStT2%6i%-ixi1&CMjFWR5$>&zh*z9jwoPK zXQdo&)?`CP{X;#v{@0*zT;p8M$wmZCd5_(Mp+N3D8;^Oi3zi$+=rwO$g5FdfR`eDq zgrW+X8WFFO(b8OONTft479k&9;PTxQ#JbY!TrU{|*-6s4($-S=ZE`gG&UhI#kvCEb zSZG8h*Iuv5wr+siWGY`2OPSCg+&9e{ndU{^EAYujMG6}Ex0RzYT^aZPd)UGJN}ir= zW>&ewAYa|bpPRO}LYLzFX;#kY;vBtka5G2ut5zC&{ekqSygU$lqxbS@IT`XRi|u9p ziQl(nXP*h#*g>zX=6coN_3HPJw0;ap9&At8p1`V`8c!F}nfWwvB!cqq=k-)eWmEdw z_?Rn$)LB}w=+=M}bY@@+Uu>7o(?iGQ^RB8i?1q%R4=-pp&Cw$lEESAW!)W_IwBGGl z$|nD~sO(nTm`;mj75#O0aEii1uGlYysf*+Hz4|8 z1U^gP2E=@psI$~_Umc8U#&OAD!*Qj&gYeeAF1F;CNtmOFd8^l#T(G6X>!069GNx~r zafHKc04?&5mXuvJhZ-$xm`ZsdCi2yY-q)Q3OdkjZNV^`xiwQj?2}SFi2MQ*7`r)bP}uHjWLVwG}4Bm#o6zS@q0Z_jy<7~ zl7F4t-1UxTyRE20!s|PIZsBF1V^Sf^kMviVuI7Uz+w!9piB1^FmW6#{t0&Iy`4@Kb zr4-(m?uzwx6T{@wcTt~(jZo4r`*M5KU7#!1?{m(rJC5FTP8ej<-qT6!OCn0kf`BBD z5+eT@_Z@r7v^Lz84If8nWN`B|&WGx%&uN$-yWRY~|2~@-*lF~tHS?egAo;02exn@% zNTi_PUz41me$7kq`aj;_LRqEm5t(LGRPmkcfPI1`x!R~u4ckq6G_-z4aJKPWI1xw4;x;rpbtHM6e;Tp=|WTlH=b^w<1 zC)Dp>x5Jc$FYT$| zfYXnI*Xk&Cp@N5LKGjnl&?kha*4+iqUlGe=FXZ*bmW!V9J-;1}8s-=Bhbfi< zc{52*Eosz;rc4Lk{igZa3N<(;?1MdVukQY{`#~mgIs0q!D}Yob!Rujm;ZWpw{iezX zn&A0W4lgs?S<1~rU+y>4nUKbW<&nO<)#PF|&B|>}rDU%7ZFen7D`>S72OnH}W(;L* zQft0tc_R{w+4WQ3w_*B7=!<%wFn+ym8)iQLN<06B6kS)q4Tn2rB!@i}QKaduT)L|| zJo3{s9HGoso)*>zg%%5 z@0Q46B=W(C+!`WJf$;ez{AUSYM8dC&@R1?>4G7&Wp$8*$3jfES{U5K3;2RKqFanF|J52uMJ>(i^h?Y$cr3+g}A;V~a z?_C`cSZ(C5RF*|oz@q2utZ9*n^|be%2pqBn=B?AuPpvV7KW^@e;5eZP&3Akltn`$C zLb{IO)23AD9oy5Kpj&`FkmxlT7D&Z*u_{vkD4xL{mg^)*4rgOQ<3~EIo1`)B#%qfX z-f`sO6q7WQ%qH5|xPi^y=SOJqFBH_zz++i?IF zEuN!dm=vp@iNAmNX!K@D@H|+E!JkE>Q7I$Or>C$Cagr^AZ;-P8#%dpowT-tp<@p8> zx=^DCB{zWVjpEup>VIiZPDeU1>g?8&TaItV`(UIZLx&IXUU;3+=_w3(Oe1xqRC~DY z#vVyOXw@~Dr992p?RcB16_WIx#yM}z#=i%W*1xq(@w)%haFsqxiJf`pXHiY?tGF1BPwNF)l!7<^+En 
zPb*@39mIW+34i`SwkG4^m>($G{HN9V#wko9RrRoCT?nvq*`jswdOH-J)q8LD{0a=6 z8RI|h)`&WfDI2BkJc51H2>EHFLB+ZbquXr-5x}U6WZ$Y92u#INUrwl1kTrh1vfuU% z*W)sKxLj1g&r=WIfD3OJFxL*4PsM$O!wWfX+i#twOHyx=wZ5FCiA2mBiQU`*&eWGh z=*1jDsWrB)=hyJ#=acnajDst1F_-(vb~Xj+OE?vg&IVwc|CoRNjMkttLrQ~K-e&mI zcVDKSRyBE0G@sQ=tCuF5OiGH@Y$RuO_J7!u;g7cbZ3rr~kAxccQXIF)>wt^%XQIYT z@#}T$bGNP0ZjiiJX5yEm68dR;wydV`5S)%!XENcbj|TIb?ah}Sk$o~N=wU*ez)Zp0 zrMKm**tQRi8iT{5CNaTYNxiv(d z0_A@`--Q1x;fqN4brC)?guelyyCw8sgie9r&l0>Yf^R_d!3cbozzvA`EKz4$2MwkA zb;i+LMXYZ0J4@6jVXdvsVhP_q=&L%q39lFC+CK#72EZ;>d4--dfUZ9M(tnN97+p)B zV#@vO4X=Gt=Q@7E5agsST!?B*z;ac)el%oW1c_m9e^ngD^#~zmJinN1p#P(1_V=q5 zQIf+KZN+>ac-H>uMBIKau+7TG!N<}8BtJU+ZR52B@W?~>Cue3lQd#<&{Wc>MdCgqE z`8mW0(bsJVS5}CCZ=JGc^>os}?}rZaG6jynQ0$J^WM4SYruqB6NzMiW&(As>wuytE zMvllnZn1!VX1z~Yewd(@UEEyGqPj@(b8Y7(+^3AR&>`{pv<}`rx~>woUKdT@c~Cp7 zZH4sjEw1K@_ip$v?{9#4wuz@Fh$3Ms(5Yxi#PXv^;-%arh+Y81haE%^9RX|Bl}`0s(+z8pOl>@4f0MQr6dnydb5 zGp(YJGSw#&PqRos(rh^kaKwEhXo*5ulDk2pcHP^eW_@5ZZsS+;su=g#8a}LcoEvRi zEZ|bh-+|gb9#k^$`iT3U*?JXKapL^ZdWm9ooVOmOZCvnLlKsckRLNXYB} zo64f)$hU67*%!Y({S!_@(%Z{9} zZSw*CQ;jI+M9r)9wyvOB;T&`Cl0D|KF`#qdsvD-%sS(b-xfK;JwR`_DxrF{?s+@iG z($Vu$`vZ9k(P+y^&kkzh7L@&N z>$QyA4di&G##`eKB5?bzs$sPQe#lgIIIb{=3E0nl6YJv(g;PHY-lpXh;J*4@O4I#&BmT)JdGs>KQugn6bvQ0w-txCw43X}S zV=ZJ!P(uCkYYm4q7QXh*zIm-A)H!Or-FHnc{8FXtWu*~=@KnroZM6$zTKdzIZ}qq^ zbD=e5is4Ma<7(4wHfv27FLXKLV6h)cQXExF9dLn9qD#ew^t4fj_oh?tCpM#PL#m-w z54cdzE=^j@BP}Gzy5>_`*XFyzysCn$8)ZQvXSvR$dMydk+R0WtBDGS8LZ7x{-+70eXlT*{E4hNG*uI#I)9|ys^X50ql zJVE@TOatrNNJux4GM=)-{dbigz4-NoijI-epCQf=SUFaj9aI{LG1vOWxQKazBNu&6 zH`JtpYPILe2cHLE=h$?Tudnk!f)pnOlc6N2taCk~%fcPHU9yk4d&?bRAGtpaY1>1i z7Wu1k*CQzxHl8`qE_9qy1l}LoDOg75P80md%HK!du%knfkt=@OU0j!ucRTBMMwOAn zNcyeB$Ol8D+Vd>$OgOOj8COT~JO#>v_Nvb}&g)h@r69qN%?t{?#(fddboi=Y7tRa+ zDD`A%KcM~HT#ox2pvv2;x?BAHqv)*6c2M*~@g}ZQKz(el9#Hds(PlmyMnpd11reQh zjTwA{7{&kkU}jgh_%ZNV5_K!fL6cp8lpWEwRZJE%V~$%Hb@u5SU&qgm=ir~cNgBRi zhVi<#&x70P5FmA&WwZLEhdtLK$%R!a!HFL=cNMSQgj@OZif6U9A(Cn1gvFCJ7)d|< 
zd4kbqjAUc9Yx7zjOgWt`Dz|fxQY*w=^_4prrTosV{$krl;WS;8b=-#eA2@ zr1GDMnKWm-{%>MEfajh;~(D^ z>gOH;JgR=VG`lw<``wZoSblk7hqq%B3wt!dT_b+u#lc%>X9qH4;i-Y|AGAZ4vQ`-O zBSFtDKME6(^>YmLGzE78DMK0Lc4V!n5_o^?78Fa?8G2NI0j@VHPvhA^1sdy>GA`%3 zVj@T8eE5}oK;zlxuP)(Dhy8oDe#tB7gxT^U0>d@+{~uH59Z%)||Nm1_N+cvC*&`z( zCFgQph>Xh2o|)O3>^-t)8D*7K5g~G}&Wm;wiG=J`D$+1Y^1F_A*XR55&viR5*LYp8 z>zwC#o#*TEyg%TBk-)BswsN>4-RkFcg$d|;tV-(sxd5(~ao_Yp{7JmATmlu_4@KOnI2rSl4SSD|M3m})!pLtV0fJZuAjwqUAW!=UkAhUvv}SBpU>iPHgBETdAJkfp7R)S za+v#|Uj5)fc&;0S7HDlT_t#eoNrMczqgOyQ?KEu!&_Amq#}i&4&*5F}k-S74{K z9?D;M{j^S|52Us0J6kFi1Z+7f;{EohklXLrY=maJIP_c#LU#`fDJV-rs~=h6j|)sIV?zT!HfZ?ma> zw>k-#{Z}rWAQ3btQvbaDNfv0$rTW%T@~qQmF+T+gR)=|bIUhkaBl)TPS*ZMh%?aZo zLIRJy-Mckw^yz6?L5w2<+vs$td;^T*E;sw}yBS&%esCD82Lnos`<-}Nnu)6KE=dbr zn;{x3%53VotfpCSQ}qU&#{aE@Ve{Lw+&{3qaeSZrnI$4ZS{B=!&xSeioubBB`l98| zoLL7fo_}J!{%b!%u&=CeQuaZvE-4Y|E@^A)13PLu%;sHVC>UmDi}PI$ zYJTF)%Nk)UKQaA0)7~AmCT-L!Mv$O$cj3w)pA6L7o5Arp`!aCbUP_z&hsu*IAL#Cw zBxAUudg-J4vHU=Dz?gV1kzj0fD#bzxX?1k|+3r3<;!&OOjWK*n@<};eS$QWE?)p2{ zb7D&!<*w_m?Po%VNM|+5w+Zs0&?EKZsKl2R$nRvdd`{;YG?&&LcxQ1D_J2C>Y46bn z!c)vuJAE{feTtttV?N%6Yt_%~<}Wt_F9Xh0%Vi5RcF*(03(XDCo%P|ZGu>D{vRSZS zueK8!SYcqcT_qRFn#f5^Vs)-{;?@W5ODiF-2wsr~rM#ig-1*maqAg(FX-u;`su7yK z-6@oNB@|($o9z7^=7Dn1XC1qH>n`km_lQT*qzJrs->cu=6axj?RMVLy0#Rm~B%(jt zDddZ!(1{1ix=8(WQNqeK2N+SjBjV+QyWoc1M&afCH(|&*J?HOVZ-UHr79P#fd5Tyc z)YvP23La_gx#rtu3vb-;$}QZh2i*n5Umm5Kp^V)o=Zsq!lF`;}91~^hq|bW0mMmY2 zKqXTVy=ON4kv?et202MVojMcO08w zym`Q$T!wN#7iDCFi7$>`=9i!c{&`0UzG7IOz3%u~-NfDKu=x2sNjqXRIP?lx#r$rg*up5;MY|y{kW~#7->vXH zaT2rNlKA?E5#a)KdS`Wy_N`D{zX$BKm}NrPXC*fpgLWW60&ee_)ufQJRss4l6(U0O zcvr2G616V9WcU7x&HDRIUvb>;7WZMq{lReG8r)Cef1K}s{8^lfh|ME#Ue_i^2Im`K z?F4GwEnW|X*D2uoSzOnJ>kaUAFg!nt=MC`rEFNba-iueh-1in1`a0BXb#=tP{|J6( zssgb4A{=up-{dGrr?omFB2R+%x&JYSd)`T;rEz%z?)Vrr#jK;?UB}<~g|c9@@O$%? 
z;T0$3i2Cl9ot(Z%b78t}%Zwko|7cF8ghdi`Ik_!4eJl`sdbyjB?@7JRbMq7XuskV& zjqpMN%V!Wi4$hqN!8mOv3Qo@@A@Rh|(O)%O;N%YZFAV8UAiTF^@MVGr*lsB+GP9%y zGa8K9O)SqNVp)l2j_vbAeR^B3iZL4^_csvtt81xfyx7(n!bBvxTlql2so)YKp^PbYus>}+71Va9p6aJS9WTP!IV$gf&^F`XjrKoW6veoRSo|aKocfF9!Lh$k z^9XcUwVBE{ptg@MnTH~>rvKH0&A-bXpw=m1ehOInZC0 zc@A)UZSFVpLmk=b?nsE*yA|jk>EZp(919-H@?L|5tJM2Dbwx#rdfvNA;eWB;{{$0u zr6DXX6J8G#?m%mv?OgdYE|2-k4T(-as|MoN(2LXgg4FvVR5qWZ9v4}*U1{?=lS0_K z7q-9~V0IoUHE}Q1d^kWcuex+le4G(}9Y~IeGtYpDSsIQzx;O8qZD9plZ^X)=g+$b- z+qPZjF}?*h0B1^1Qnz06p^`{Z)pGEGquUVdv`<$DBB<>$L4FC;tu9+pESl{07Or$tKd*>ga5X8A1mR`r!JtJe&cc!T$O&>qTpJAhN7lY@=7;Lfg5JFcq-H;0`_ND_d zR8Aw|BINN#%`U6e-r&dLvv$!=l1L@I!NnZF`fiI^}_;vpkhV=}9?f3kHq8idnpwpIbH3Fjo-?kdQaKZf8FaNep+lF*DGr2!T zS&=s7U&7)>zllFZmR=05`hl%K7NzNvGO72A<|k91w}?h*8V~il^l@A#(Ojnf+vKlW zpv?d3O1QfZBn{GDpU4OSZ31mUZt>D6#yi&{erzMQ?b2lDN93^QW7BGD!ih3;I(RO~ z{3gCYHw>L=XpvoTs>YP%D5CMzeV<6P2_my|x{U5aICh-@E!Vr)x&-$8Z=PQo; z-Qqrs*mXAj!T#r4^UqHKJ8qNn{r~Z2H@S#d`zEjJ|KrGP@(u91Tf80&uT#MFv$(Dc z*BktA9qeCz7S9{t^I1I3$_Jzx#Z|whj${5)?Dr3L{n`2vDnEvhDsevyJz)OsI@>ob zaNfB4e3_go*fI8t_HCIoco$eS!!GWKy6eX!it$<_v$rJRC*yduz1NZc=~Z3yjfr)f zGT8-W#doHP54pngGTuR*TG+fIvfESsX&81LU5Ov3eX%WLuZ3zNHvWnVxrB=d9db>F zdsrKEF7cJ?)oH>;RwX7~Ulky?bG+t-)M0Srb8yZR{dn}G%)Iio4Np`wRBnCMJQg(? 
znCgnR4?-KH--aGu=0O;L>??KWnzW=03iCrwLMWl_TUgsFWaB03i>Z=-v*c^rYU z))jr-zsaAawiC)uSX0kSNUm^3nJYIrGSv3SRrB*` zt!q5mUNVCH-fnZvB3fvp)h+`yRJ}pE#J_bg3}5gmd6PdtP;IrLo?q$82Sw!IdEU)9 z`+mxZOQHS|z}m+X8RMz{k2E{vN!_;9hesfvW6M`Oy5vD`a_T*@y9nsKm=REQ5L?G1 zj4R|f>7dwdYdu+V*j6d&*y3sIIGdbiEbor+-LmKiIt<1VyeV9ApDDf?PwqW++~oP1 zG`~y6_9sNXwnx)kdV&wKUI!YK5^D}EjBLv`5o+T6|R z!m;y4+O}(Fpremnp~|-kYCq633n}Q+!>snrY8CK(pWu_xPuJjXWj!I^^J-|%yvw^}g02X?<23mCr>`&;a{K%}U6 zmpsAoQa)vFRxkK^r&;^zS}v?Vq;vAe#;O1LisOE_xDO-l4~F~J;C>2Ne>XYbIDZ!B zB4X{Eye^z0gYymWx?8*+46jqb^|QFHi||ixfUkq$`B^+~fSv!}eD=RM`!&q80mp=T}#@t;)oWup3w2}_*^8y9>IUOXcxvkTzAE z(UQ?{#R6Q3g==_cYh>iO|xw+-WItq=a=fDV~eaLG88V+B_nPl z%rX8v=V0;c(lJfM5=geS#*7`bf!+y1KA&hcU~qmTa(crCAe=9=?0(&WUuyP49?LrT zZf~XiMB_b};^O=29!i0=S9j!*E3r@9CtM)DAyaCJYPDYEa*okpWUn zXn@lc)$bN`4)J(nK8)1&o0r^8S42{4)3*kUC?-+;6sYY4!d6bi&0&*23u*=LQ@Mzn z?J+|xh`vN0AI6cv{5RTVe;uXrJVJ_$)HRU0tFD{%V1Li4`cvx^T(i^@sQOv#Y;7)S zUex#XxYOCg$Y08T>tOSYkEN*j*>3eM2sLj&y)5emW$J#B87l_pp~?x>#nMX1taf{& zw89uB9}3k~JIe~O&!Y>6JN0}67uqgu&U)KN9CFl|vewM6y7ew%UuoC!I2Ncw91=M6+21jp}7uNYi z!2lhX;eB>yP(WL}^(?mlI-^a>8@*Z&@=clT#RPJx{ZDcY9UwZKA7Sk{dLOJ$vXov= ztfaQ{{H6&*E_n@32k)x_C6%Zu=eir>{C=Ll?zkaxvi@3TzN$EkPF8oi1a88+1$|B8 zxh2>*KAzGRrp^=EoV(nUkVl0Nq>`sfV9(Wk5fcaR!jK%+1MBmRQ0Z!Cp@g?L%Ht?I z=+%gEhpgDQLdPJ~Nqt1*g=iJ5KEPiZzda5dvSYCfZ_5FO0S_ZzdRK#MB5&@T{&NZD zC)mtb23Ep4e{R2_%Xi_)t<|2Q`qfZ8wCz;bqa=8*IDrLR&X6}lY!*-*lVp} z)mZorC6yrqpK51Sv{GdNYj>yraIvbWXH)ot?Vwao*lyfvk@vzND_E!yI)=36hpTQjYM zqEGk0Cx-R~-6rh1Jjc$67#JZXpBT;zeltc1Hj{lfyJV5B_pj((a}r=***O-6pSdtU zwA*a^=c}M!h>M}#Hwi}i^bX~4exh{U4{vziJ4<NI-k}H`~7p zo(0WENODF`e1ZDQgx8Xt}IY7p^zJ*TL}oEOvf8Z?HL^#pCQE z%l6*Zv;DBYHSJG>UkoU>4yZ`F9)MJzmn9Bs#vrkOomTA?%#qtH3I!R*&w=%oJBYca zG0?>72_tS`xtxk^7bi$jp2)&B*39;XSl|->;^l3T zENEG(v*L=>9;@ zy6I?p_$0tm=#H}!s19!Lx?*4td|zD9GKqDEbO-N#xaMXK2%jobSo`0SZp?h+-!eW- z61;BDT2@R0#ZG36_$LWK%$>+^PsTK<_uTDpb!Sa*ZZ6+wdyEPA^F+%;et#wP{h7Be ztAoB7)vm>S#ev)_BGvDf+CF};#|Skr;Hkp=!6@(EW>I}>sO<#pMK@H)I)D!2e8b-} 
zQdIuzX1gSdJ37Db=|_y$g}wGt^J_SO5SU(!`V({!S2x~0)VkY_XX~5wU|(xH2dQ-m z*y-8|1W#k*M);_ir;i-6yRoS^czfZ$bugh^vCaG}^}6rk0;u~DsOlK&9QLTmgDSWOBCYzz)p-^lZ*fTa%z$@Tj2v@mpJ} zKqtYhP{J<@8;_Rs-^)=brfa^h+Mb1eD;*|TU|jPhc6BqcZcJxD$8P6Si20`UX7y+p z`64CvJ$6=8^=sb~LgF7dAv%lU+ioO=z|~uAt)3VUyk&c-5G!3Otls@s-vzO3&UD})|3EK8>Zrow^! zV`&+snOOhW$?(N7BwFZUj9*MSBpO-_mU>dTW(1qmpV~+?y;P2%bsY?kM}aNNadhpr zkAKTMDa6|=QSe7XB3LecExqem0=#8>R0BJf z%)I)T5_sgtkoh!|IyTP9a)z!vXq$oB#v2-JKKuG}b40!p;`M2ym^&r`dYmBX`YvUI z;Nk58byovm*71&2o(BrhUZu$F!{8d}k^8gAcKKD3gtczNin=qWYeX2L5P zdg3^2H~vN_KNA3-%O4uQUK9!0T@DdN90B0oTvsx-MLAfUkpL=ikiF zV%yEU0Y0C_r#TaFT(IJHe~sO6DrFYV#&g53~x1A zT`jw10Zi}TIq<068}xki_P^?U9vlD81s#mniMbvi3QXSE=j^9DQPmDT$h`}^VGo5X z0rpKNmt(+r*EiuGG5@lW=PyoIe9wdq=K6b`R4*c~MQ1T{sPbB>k zG6_a5J1R*X^VNj`+kFw5Gj6bF-@VbjJxF?epq8|O z1Q~F3Rg+{x=IvBOr${D()lsQe{*q)w7FE9oGlL1SL&5VJoZ#Wm^-Ak@Jy<+-+_e4R zHtPHHjK7`$DmZTlibuyG@JIee5urB6NE)ODQ~Q@u>p8wBzGTL;5-c)Xj+gT~^mp*Jei zA7DV8&rCQm#SEs?#a`F_HbTd@=ME-%Z-waIlS7wP zw4g3`VWB4jkxV7o~qH;}02YL35E;Nc)2);QG zFy8CN3pzX-h|>BQu}@rQA-_<>pzdhB6+_DiaXh_!7%;jgIOm7(T{AUxBc7|kR+BBmV-GZkom9S z_H7(F0Olp@A1S8J_c{%p-KEA`r4sQnEA)JmSDI*76V$NG)$gVcfNE(jHw+Ky!Ev$M zpZTwtp^-wRr=I7X#dMcmIVx$EDA128Hmtb}6&}dPUyLpTWG20HTDE?0xvZ5_@Xa-_ z!=b*`0b7UON3P!9S{@7++hwYTr^4XKp?fh?%kHq9wZd&+SPim<<}>A;U_{cwa%6(L zJV2?yZs(wNVXS{`FUpPnkd4;%SX9r)0N0?a&XaxZ zc5}olh;L&04LbDTgPpcUG2Za9Yh!e!-8ta+9!RhrJWL%Y)7D0SJ-^>)hWCk~0}mX{ zSs!G?$}xg09ovikFxd^iv} z5csZT*dC^<^@QBnr2y)sdl*bTPr<79G~AKu2y$1Cc}Sz=2-2@4m9s_A3`xG?!%B`M zqEnad6}){BhzhNaInl>Ep#s{&N7ANL&=*?c3ty+~5a+C7OZH@cFriX3=*nr2q}_~i zJmRMLKVNa&?-uuA#Qni=-x}Oc0q1<<{8^lfi1WH|jttH>!0T@RR}c2DP65}?;<_$e zZ-B3Z;rUrSZ-CEd@i?pd>Fjf}w=*z36(t;^ARn$Of`f%$u zZ$zu%_qIf~Fyzy8wuJeeP-HooKRQLA7dE{)8RsIU3+CpWrYH0HV6@9n)V4e`q;;Z2 z*|yvq0c##k>ur$;%YgI9Jf{gVXq?zq_Sp^w26*_?SVx1ZUzURYBty`zXJ3f87{I8- zc#GM5XSDs=_2@aaMD+H;vSi4aOUNC*cR5BO`w^}mqc3-X2y~K}NGAMrQ{cHc_X?Ip zEBy9b2>*TdZL86(`wi7!709wI@78LCN6DNW9AKCE3h|ebS_JvJJi7hR9j6cNp+qD3 
zBVa#E9EoWoC+^#15mE4P1MOHhAM*WD=%%l@nX6YBR~G}~NB2f2p+^llu~5xV^#_~L z(7$;~zyx_z#Cv@iQA2p{sWnqM-z$&CWvho=5zd|dSr`|Qa?y6^6UOVJ&<>=|4tR=^ z<(jj{CjLYbWt7CO_=NZo%hJA7VD(_6^if%AodP9x{(iW%z$#hyQ1kO!0?g=M2cMkv z8w4bCeRvDD4hEO5N>KB&&};aTL*mySGL6K;syVJFnG9}l&zUXYqjql@qF(bA!wn9tUERRHRRkQ@`hy!^Y=>z#B|H&xlZ>@ zVPas3Zt^+nP->~6te4`W5k9(BQqXqoN61G32tDvLkQG@XwcSK2X-~DKc0c!Sq{e9gJ zAN2CA#=7bor63SGm!?^l!X|7rIit`YDHMsUjD8S|UVVu$3+*;V8nU+Y3teu8_2RO6 z%Hl16zUiXe@s>1rIj%M_^|BhODyCSpu_YK)xO2=lebNY3@L)acV6q>P4R1e~{YIK7 z?6*Zqrfrt|!fNXf)1)|hq>lgC_cM}c!v0s0JLCC?wBq6#0XjC6A~Kzp-0WVmNshf? z_0h-VR=d(Tv#2dF{99J_T22@7XWhoz*^MUh+P#g_K5vxaNI`%+?*tFx!>3bUF&aau zWV=PLR9`>}OR|+)+o?yq@Ym&9t&b%+-J|{SN`pKx_nGmot|S(?bNv}l_7McswQ(g% zKj1)VF31|R@i8NV%lT6QN}tI@Qf&iz{SoELvFwl>nlGg5TV{WsU|gd-{B3pUwtE5j z5o6go~S~6xktgPZjS)k=s!2SgMJ@kwx#OLz~j_}xTV9yQ!)yPoz_^Wc8*2jE^AB3lp4a&{SNbTeHW3B z<91Kq_vRu;Q;+pu z7+$A<>t}IY7p^zJ*TL}oES@*O=d*a64GaDpRIEM&CF)Kav%gPJ z^ZE0whayk{g^DoXun(!S-zWoNKf8)Jz%tr`*s7vkdo(V%!wG7RB zF1jMbpL@g_ChgJo`v#v$qfuyBem$)lv;m0;Pkyp(6N9%`wryd^2?9f&$bm>U24JN8 zjq#j+6gs1$`@ZF*4>DH&hI43ZF!KA=@V@YGdFZ=HM49e>0yll0+6e)9C`gMa7$SQE>_s`KdI&k_I3W?!M{*G1$@o+vtZTOkRZJ|6|O-V@I& zI@Mge6+kMvDhxXr#));P?&y7X1tcp~t3&1KPf|icg8L^QEmX_-Hzl0$JR;)GQ9|`$ zY_}E(d+O$oPCVhdNjr&gc$1H@Q2i9F^}`FatNqYzYtO0tSxRTy#V(ACNbu?%CRFkbIV}*gKZ)kSVOBa-nqf;V4+Z!2sWRRNhRt)8sylbpkAce9WS3C9l zU=yjuo89PPsXae9M%IwLP?Mtn$dyS{2SNId>|irT{t042O+y_+^zc%kFs8+`}B*o1pHxjXxeWYfu!vEv}L3{8Wm%Y z`{rjG0h&)-WqCakibz>*T+1G{KWEgIDhL#lgaossD9t81iB)ZE|#^Gr?I%ms%IM>As;h<4GRHfPiBJ$R{ zqWX(K?rzQ1-ko<~_|-w?sFhIk@{yEB;S-@~OdqrMg^ftWHZImI;)E@F^N4Oi{F_r` zX`kfn-oZb}mFh*^61z?!<-FkK#wiPAZ@A;b4l8x??&)q0t*lUz+2_IXAU!=|=d*2R zN^FP7*T=Bmj7 zEG`{)aJxtOC97{-@jZyVHC@hP+dvF)XQRQrdU+eNcKNACA!15^MXM09Ju`&8sLYn5 zx0%qVePUXQW&}j4BO$Qm;1rRysxrLu;sWU}z39t>4;f%C4SleUGb`Bpc;lY`mv*9* z&W3HB=p=dW%&F*CQZI>(U+?Qi4&;&<%=bJwJjaFB8I1EfUy?*S_xx5B-=dE+y*jLR zet#tLEqrR&ruzyaCmQv^iS9CTr^#Qa(li5cZ&{d`uXIF4YLsPox20ox{;t;t9rV!y z&C)fh@jB?1_%ELCeqI7&)+&caKK-AsIPQ0g`!M4EV7PA$?x%orzH$C6&PBv|T{uSu 
z=NsU4w|G4mUZ;TTXK`H@t~bEf!SMVno;Sefvv{2C6uTQ#%sB(aMcKBMJ+naXRS7kC z?a4yee>Uv+Q=^Mm?%ayV``f-w(Kp0IvGc!(pCYF%~QQX>EneeU<=NLvfp5Gbs# zHOa$W{JBq5MGv5U9}n5@CqF0K5N?GEYlgwOYimW)WM_2a%o|4Wxt-|6xBvOwYRa!A zxG=_|D~8Xt_RIYtza@U#^sPBFN^`H9JqB%*c}L}ZQ+A)-4Fb8 z{cI+3S28IJIo=~T96LwG_y(kRrki!QU`guFuVQOEBysAvvfL+L)Wnj_xxwlnN|%C( zQ43c|@6$K+1|X#APtf>M0>Tp9`GK3Dk1jp<@N>LO8C{~Y*__YbiqM*-##!l)&BBK& z&ck(i^RVJ#0kl%o;j7>0&t%b}LP?_se~9N(3+#pW)e#%X#!qft|4bUP6PX{RVBE9_ z&oGL01uQ^wBfFj^pit%U!gJ&}^5T-*s^BvXWEaN{4dDaxM4HC!55NUm%50w%Dz|lt za=-K0ZFFXmh7b)JIA(X; zMR$aFmDQCrDx`#@I2sgrd-x*3ekHkoUqvHN%oIFy=K_!_Ci<)13)<*S?%yuLuP{FI zqe6?iW-==OD%RRB#1qIpo&SBr zBLHz%umk5;A|YS&j;?U;9Kg-D{z+8Y1Z9-FHhjB)8@>I~L8RWy35kp7iDHh|f}=fW zV^+|s;BZuvKm9`&biRlD@IHhAWc@@d-D`*A@>A}WpK0|s@DA6={-js#pUa&pLYeBr136Fe7Y@+ z_+vWdOucCqnNzpGFT=!_SS9C9qA%8l4^!!$^01hIuM@UTN+cy@_MSm*8odhI6=bYM zpUjUWCI5B!!b^a~=6!i=v20+ukSCyroeeV3J4-*ReNAo@bI_6RTp%7#bB&4nJWd|B zJ;_edYbGkq=yS(6jZj`&=oh>*@25l_IOqKXJ%-zV9hRTqlS4<)@!OCp5{!Mm6=Tl;{IT`Zw>CJfOEca{w&T##CcsfM+WB`;B~inJs4i6 zfa_;*T^Fu5z}La>{4Aa~z~{4goOM;_A%9q$hMdAzay1WVLDv;0Bd=^%Sn9aD&!1Ku zUhOwM`tq6q()UgKej2MQTJ>^m&{Ew7-K}trYirIM*n3~|@77`oc;~>e?)#cTAfE4L zrtCK@sIqv_l;K1)I%AbWMTB@`d7m>98$ITUt*F}0` z6D9Y@Fq+H~zI>mtPzW_Wy;*k)i!PUiV)bAY?;BsJbqb{92`y8Thw`XENl3%pZhIt$ z_>HPJfICb0=%3BzQw}CQIf~_H;q2K^?RxP7@}Bg+EjIM)#7=gek^(LsL~?ozo#qGv zs=PAq#$Hn>@hQrfk*HjDfs9?3tP$>U5!N(m(#`NtH{cZx03 zqP-PBZvAgdV&Dp?*(c&nf%0y!^Q)sANnnh0K1+?J-K&ADwMTRD%C&Ja1I^59Psoa@ z!mznIAsw_)R#8?lw-Dy1$~9^Dexzu}dwup!nIYM;m@Z-$-=_u<3NEcVx)Fr)9nj2f6+uZX3gH>hoc6RQF z@syp_neE1C0<*M+XG|4fOYz=sWEKe|i0iZF64&9%T4-l{cr;?8vJw8&*9U!_-LBm0 zRtjPx3U~8w(}&4YmL;5;q2O_;r>N3QBwAtCr>B(=id@vzJl-~^hx*knDN_=PfcLGC z565*H;L5pMpP@B^ZNs9$UutyW z5ka%l2a~EO`a;UAH8<3Vle;!i&Vg^G^Q5VE~9gZ@}YFXgqoO=ywHILURU zZD7LgDdp8UmK;U!kX$8HnQBmOJ85sFtwm4VP71Xtd^*ymNhLSr*&CE!T zh~>e(^P^|_$TCwWSPd0>iIzn9=*g@fq>X;kJv|K>n5y!zkxNJfT-T%xoR^`8RgPSj zyys>}UcCCNvFT$ZuaM7U84BWH(z8hD0j5hB4b0bYakm4f4I-s1?>Ydwu9B%hY@WWy z)Jf-XS~3uAukA~WAj` 
z0pm8h#~q&P2<|J6``zL`jJQ7-?puTVDd3!MoIi_m5piA@&XK|S26){qUJr)XDd759 zT-Sx`4e)g^JU@%)4et`CV`K~V?d-lkYqu^Z4vG;SYHNl{Z`>%3oIi!cb zWdCz)P0VlaEZd*_9E4Q{8gm%?ATE>bd0x-}j1GsfG+cB6(7QIX75x`;jOFBAT9)20%jwiQtlZSsAx|i5gQwF;4>Jqku!7GO~TO3k%AZ3Y* zUFF8y@a5~A(*pUika5=HVqC-?M6ZH-(^tHwwtUm?)-U=oIpy9JMHa!RmzFw&i%7zP^w6~yEv8$)cS8oI$=K8fkNK-b(}s@?#2?pC3$g8>Z( z%82okC*;UcW{LJ}0_s-L`Aq2EH*&vN(0gi}og}A(A5K3H4XpMpbzMy)h3H&3tCoC& zbUPQ`ZXDl&R0#&ra^F0IY9Hj}`bs#4T;CqvD(-t692DncF#Y(IO=#DQMCBw%_Po(#`bWm8gT9c$-T;*Ih`4x z-Dm$y%7H%8*;S?;HpY__4h?IQhm!UbYZsdFX6AID2@5(Uo21#+UgE&U!f<@ z)*FM4giHP#y6I48o%6+@&t9}knk{VxRs#9dPODxaXCPg;kHU6P8}sjYnK2bo4SJos z6^%|hfgxZPx7?T!?d0O#zkTW|U@FoR;16zr zsfUmJ-mp!E0UyP+>Pt=|Gjd<8|Eg#q4-1#x=w#`F^E>oD-s~-hyw^sc>F5<$)2jbP zUeXMq(0@Hs{Ne&?{>5tUDjx}5k7BzA7w*FNu79?W%nFHY-H|s$K`C6@q#Vv4< z1#zysc%PJ;@ac(FfCk`kxY_jmfEBo%U32zh699&Jtk0}|)sQSs-}dx>EKh#bLf>uN z`hcSNaWV@gDgeU9cK*&+y=382MQYQsXJO%zc<1&nCcuWnyk2B^h?xJx`wqj3H|)sy z7TzxGOMYvcCipojjJQSEx#N`j4C#vdo>T8vrzyvjGW&Wg9+LtNdR(G!+8}>aCB$eP z*@nzUC-Q$XfFEvGYxfZZpw`O{FE+z9(g3C`^=lX*Jxolw#Jr`Cba$)4)xW&L zK)kZ+QvK=EK;*%Zv~!CNAUo-}edKKy;J)&h55BSkVv&YZh9SyOFH~7U(IguE73@BC z+SUhbtL2zD(`E|p-x;G1QHn!%TR0dDV?32RX&n(9*D?QXlG$sHy)kI{Nqha09+m(3 zisOE_xDO-l4~F~J;C>1?=NsqG;#@?W*M)OraJ~UvcZ=79;dKhQeiqkt;d%pn9SqOU z;&}snK8wd$b1SAN8TvEuL~_pa#|K5goc`^DqdMV8rF(Auu{$nkm&>8Fc#OBGbLhYM zZ0x@{o6AWNSbz8imVADv@%Q36j4Sl^63eGVq=24pOt@zY2h}qSV@?YhxD#`6yq?Zg8m zxCUDkeV;1z29`6EI_fWTA39@BV%K4QY#=>F@~d?e+B+-(WM!-DWR=u`hFbr6?G-!H zvpT8{%aqpK8!xoLn0ctN#twEvf*cITZrX^ zr&K=$(hHT@T#WNg>NeWs&r;F`=4JB6Cx`=q9d7ajc|=C=B9$Wp0+REtV|)Wjh@BF( z?iRXzC(#(w974`X{B`iC_Cj`sy%fUqv!F!7_XMWvA{CBKQ}qTg5qa>)LbRQ@BPi9l z&n%T_C3ksEX7?6!f^%_mKAYb?_lFv14daTvma+WqXkPpEpGA5U6QjnK+BHc;>8iAL zhpG&kGjveLE5`$!-5PbNy~jyR&F}j6%BF#PTJBPDlt#*()-%QnC3eIM z!qMFI2`s418<7)bx{Jii+vS@=&#NJ)XULB$E$KiPkGAtW!zs!{dF`#!r@i3glv4kn zuPP|5K@2yyO*E3$UyaQT=9GX}Y_kr+T3O(*I>!lqYj?oV)%JVohY{F*?LfyL?|K+> z^Vtg1&ohWWjjH1)vlX2CWkvQO6+w1b*e@e(i%3Um=DkbJg>t5MYg*j-pv0EcnHP+IV^EoKiaGlcQW-Lpq 
zE(=dpi5ycaSAp8K>h~QokAce!*E@@kWK+(v4eiuFxzwsXwEx6QhAI-HSJp{o0S*Wz zjmMVK-;=M{u-oqvKLPeSuD=l6t_pew&D4_xl~I)Mo{;jO4X7-BJwTtYLcBTakBd|DBVZ@aV6z@#hhBYvv`J&8xZPLwJ`xVkNAr(PuL#quL|cro>3_QDB>`-Vsgd>stW&*FIl zd_IfESz-Q)zoY##@Xl;)N1_QU@`m2*yY|yqG>_X)KwH2cDgT!@NcuOQ&Hfi>wN8v> zJ`wGOMzcSylXrLtBoY>j3p)AxBQNHT_zgyY{;^-Ve48Y z#f41Qe~`2dXV<%f8Bu|hFTV{mb^wzeuOm_+Okj~Uz2QjHS+tG&Z+Nm=3w~{yw>Y#fiCHdey@il)jmDfcn53i(hWGMG_&Cg+c1JWg3?^{MX%II1`e%}Xg zb7ZNphFYfpe^Gj#V)|K_qn5ga>AK)2AcW$5ex4PNA1y!_K1y z0^k>kXZW3pFpwV9EPuwdKzc}9{t=^eK{wTnpzw+Z)}ER2K8i=0Uf}lk0C+J>iEuAlSct0d>feC zN@P|-)1#s+h1Fu+sLH0Mxqo- zJ>mN9wq^;MC39KzyE0$^jmfj+i(zQvpRi*bGJXgb3xD3So(z)x_*;KaLN<&)-5_Vh zQv-Bd;&QWRZ@{_}c^?Tub-=S@Bp_~QIBEPyWGz{X0ZcIu3+{_(B7J>fjux?!U_i)6 z+U`~-QdRwhL6htc3a0FgXKQ!FP?&b{4jYP$l6XdMrnsa%A?{*Z)6f3F3J0F4?N|$y zhs2>#(MGZFMDwThRfmJ+U|rLAmX#GB(R1|d_F)@kaunNM^?X=IBBXA+`7E%M^8B$(9NN?@VB~rI?#wc3oe%1^aL^lV z-kvoTbkmKhn~Ofvi`8XdAnV(`SeyxK-{ZK|j;ep#^cBbbZgC$*+#d}0t-<{iaLzZ* zpT)U|IIj!m$l!bfyzUme?!S7le{~A~^s~6G3)dUq>tJ|(7S9{t^I1I3if;XEkTf?0 zmzKT>9{S-2yaf01YCX0D1fy`2?RqGt=i?rCJEw>7c28y}%OoL@A$r_Xjb2EvV6y1@ zOEIv|zopB#AO_HCF;|ADIDyJz!haP_tf9x_a_;M=P0*lZ7q^uMfoLB4L4o^N9ak&gcjD)dqC)6ouv0~ecG(l^B$c}**)H*gMFm8d=6Jp3Tbw5$1ue_*iPw%PW4=hwY zAPh=UlmVcL$L`m)yX<`!~il&noS@)lWWh&jezeZxSP?FO@$_J$}G5 zVjs1i?Mlb6dI_w*4Fey^)v=ybz5z5@ur8Leq`osComQv}2p=U|9JEH|Mok?Eb1Ub&t$Z?{g(s?-dbyKQ@@oJt8kDMRoN( zO@Ol)9M+J58(936vhaN-L%sj{lAT)A^Xn#$QTY#%&qv)hb-=H^*SDb`_OsPKpI;&E zI7>%6>l6mM1$5YKSw<-DgcyH5P}sn@4TSP5r?|0x66Wm%_E68m*jTrZx^@2Ix`_2d z=|fAxA>`@jZ|@asG46qG@^AWlOxI3md-GTl`}`4>)X6#hnN#vb-&RWP=!t1b>9{2XK{nAfJR}bu-T!pnq%MxLiIPfZlq4_EKv}38-4C z){Tk3e*jdQv^e(9GkQLn>g& z%;VBs`(=^5aBKBadux<9^ZbipuNU&M$}liVkc$78NP6 z%!l1C;dP4Q7qaZPLpyhuIb*tb1ujB?F8a0gPU>Mg9(drJj*QrK1yKF#^L}^T@8sgK ztABL;-;!UrJ(FNiX(N9Ax%}d5%5k{A{zB4rBOmO!FJAevOu+g@x-IhZH`Xtn4_{4L zkbdj7#jls>p@_u}b{QrOkSO|8cZ+WW{DYd{E2Jo z%FVbMkYAsG9FHsd{$h^Gafv)kPPM`AgFw4iArLuDm_GmFiUroLR9g7PAC3CwD~|i! 
z;y#SHKN#*?gZnAqoNt^zi*pfiUKh@h!TARNS9googZ)>hfa_;*T^Fu5z}La>{4Aa~ zz~{4goQ?RtAerJd1dn{usH7W3072>gbEgd}l(0x+p#CnAI*v8uMOie`S=2i7Qzm-F zAS;M#(iO;t(KDET@rQ98Jwof*Mu6*g(v}mk=HR;PFf+e~J-U5aHFk7E4q;}0@j~;x z2a?s%7z~`_(TMGlX?iPOsB3}5ioI_G@kqeY(*Dp##8dl)dLE=<{Gu_?()4i)HqJZ~ z6EmB7{AXLYV0v^6gw_1s5!+t0lRvQ!{rNI;uXD;DEG`sg*WZ)E`iTXE*ao8N4pMuj z`Yo(oDg2~@2DN=Usl^n1%(P|M=P;E&Lr4uMw5PVWHRkTe_*>Rn)}yKMDYCS;-4n^Z zwdpGk*^SY&nBOglAQf(5_;lYx z*nUD{gx13d=9-vzyGpC;!)rUn#n5YOP4{SZHlV+75-`?)xR`7wySqc!^%J67mFsW(z|PaS z{$LouUN6nrddPBKxYdj;N=9 z(|k^q9b&RqFZ)f8GsXvcc&g!iH7s?s+NYQQ1bhE{InT?`{224+T zYe}AU27fh>Y19zEcJMKfE8Wl5wonIW%Nqsmat0v!R;(Kz)I(6mXtk!NaUqBjqrg9_ zf>1=_PjA1Z%&CIY*Kjk8)J zw0hiEwpdAr%FLyGcYWXo$Q{nj?JPprc@e^$8jhokFZpInDiSbGvo4*4gay*}G{~K+ zK?@k}_@`%m*b+9fYwDY=%o1C3Q>1-O=n=u#%kNArn<(LW%V{%xVL*i(*d+fc1-ss+ zj?Z!%SYL3S+{jPsH;il2b&sJVZ6!8tnI>T9;MUs-VSz9}YBHfEA`Ij+hta=&i}f?) z3)clSD3sa${fC#L#IZhW(4zM{3#xfXHK+W|Jn>tIGE-;RGUZj{9hY67_>f}W^5TSH zH>B-;hI9i@9LBSM^!{(qS?u-5CNg)l(7XqqjUsy0QHH3iyBaXx{P+>>hPAs`y_LEx zeCD7f`Z3aCu=TAsXd?eK8p_VVj>|?`L%h+q2jnw!>O8UQSoxCDKzp2UbVJMp-Hlpw zTX61Sv&15)CTH(a(%B%MD~{*g;yH|XJ{X=`gXbyWoNt^zi*pfiUe|vdng8P( z;J#bj4~F{`aQ!T<>%#R0_&OLqpT*|}ct4BRS+RCv%a)A+NN{uCeas(w-<(`OWl1}t z1U4Q+qMIK|u=~{^+u)2iWQ_F9ue*TE?;!zc=DNVrYI(QEnlAjAJD2)e=o0)Rv5;TB z#}iErkFyPvS4B)M>K@&exr(A;$8C)}^pJ1w4_x#D82ilbFw)N6Xxk|=hacSvN~mS87RXG446k&sG#Y&*`p>!BuEvAr;CpGh9}{=ZGE z6_=y!C+bnBjmGx=U3kVr>uZc{ibH*9kB7DMTB5&W1_yb$tFgz){14s)(H?h-t~`JQ z%t*YwrGEo^oG|esAP^n*B}2OukVDlJzW%cV&R`1MT{Av}jJ!(14rpmi?L^PV2M z{k+&wEVl*@_Bhe<6aaycFmSgW`#uo_6JnqRwk2d6Ci9@?NAe;tUKhx|xyg~i_9M42 z?T?}T|Bq`=Hv8LI1^$e6MKlxADy=PhV_HR@img1UaL) zqkeAI0+Dc(alTN04WvmfR#|G9A!?BV{etU3$lb#~gELfb0K)di^4&v`2v(OG)L;4| zzXf((UDLmSoyXVD2OX{;$6l`9fkeR$SyiGh#=r?BOkuzgv@ET+1 z*YS;0z~zfQtKGE);x&(%j%!UT6jbzeoMZMBacZ}|jG0#|-2VQ?hFa@2?0N~q1_~)e zLRI}&p;p>`A+l3ATMOx~teUvs><7PXA=fX7kiq0+Zv(s+29N&Wyq=?R1R*fklNIK~ zkVm$z43j^xK4Q-Q9Sy0t@3TFmYbb@HdKTA!vOoINij zppkbL+En%ocKc>RA-}2e+z)y1RD4Y1W&f6C~S*5 z1RAu11v!eM0HfN>W#jPxSaJT3vfx1Bu$c{7+Qhr|cR$ 
z^HQx((Jvng{eG!B-dYAf+`Cl}^{E!R9sZbo1@i&<%dblr+`S7QO=+oH7|CJp|J{G< zlIDo@=N%evcprlmmRxDdM_%l4g50E+8Peiq&aC6s1dc6K)<`nRVBbTvf~wmdh`MpV zvdfbSP&G5&bL)BE=rdmd$B(F+h(pf{2BV- z6qa`j^ujLCau~7aW1C_s$L<3Nb%7C$Zs<_yW}X79J9VA`<9q}9Wba29e-_)1u-48; zyRR3FR-bi2Vd^GF2FAZ+=)(90aP)G}0Oq^J_IoTn%_5BL_xHLqFRlMp71QY8&dV{YAIycwB9DcspoX^t!zP26PjhzRfkF=_d9cSEj`_&m_&_H6l<&zpv z!P>NAIi3xywl7Edd_4*|n*~_z3yRauD@WmJ)dC4-5lmathp&0#^aP0ho{v}9zqZVR(&&FpigW@i!4jDmjXCa<3_w;KqzpbOF(_CY%VN?1?o?w7a$jdFSgxGmG6F>i3f({B#&0H@94 zmOaZvt>3>{Ixjc?zbszk)ADJM(D%ub^i&>cO<4GwubT+$C1cGs-Af?$d*-^qp^kih zQXBTyJ`0q+Pc|p&vLS;Xh%c)YxX{6KdQ2ls=Fp=4<)h+hlNwIU+B8+KguDN6wv zq-nshP#N?0SQJ!Erg8c}(Z$ic{p%{v%fzP{coBH|%pSoP40+js|peKd#bn z0u`wwcakH`k58^rv9Lk23U0W1rdt9^yt?+bUN?k2W$Q1l`qPMOWV80OeR=@T702^# z@f=1x9}Lf}!SfVw&iDW4&;E~#i1WH|jttH>z?;QCoy*M;j1@O7~N=CkaG-ue%Bn@#3*pVP9GHs=LRig$_lF*f zK=|6foa(+sL>JuBM@6j9f(HWvS<`xvP%!VY72nPCu(IPvrsR|%Ag&NxPV4yrb)mA! z(VK0ghAj!s403NsYp(*?kBEJzGCH2yLSM=WW1x@$=k9%wPG+~jn+8cRTVtKc;?0k8>I zEMc5)fbG|ILdX=WV}#cep%%yi=0qB=3v6rHf(>eVZ-vHZhw-WU!3ES_hRK{gm zzh1haEU$`EgK8eo^s~e_|I>B#xqObN=?xgd8aLO$u;*1PIJ-HQ5ZvgZ)muWUce5>; z{7r?0R%b`iOXhSjO%R=~>h!8bAdm1_Nzvu~wDaop5~uY)P31}FO|jzwMwjwvTk`sx zpBVaxlUc1g1VDj?{9Ct2W(>d zWVuud!2$FPrRn8GFf@9j!Rog+;C;k?a)K)rOuA*QX4(3nP7B_J=ui_F&*fyB6fFUS zlHY$#>KswGKgs!5O`AYkyn3I5D~%iGbj|Oz73!01{ma~>2+$A33KiCeAotu1?(EY% zjd<1#bX3J-E1La=hgX+TIJF~bcJ$zVtUs#N`Mp01oBJ$I=V)9)=zjH*CV2|s^%%h+ z?Iujmu%`9be5Mla+O11nY6(L7^s+BBCdZ@k!xF~U_VBDhNhmm= z`?=wgehzTh?X69}tr9#_V3-anbpV!Uqg;XxBm!zvMpt-IDzJNZ%h4ey1Y9p~^vX7L z06S*yHW+`Hr2Ood-`{HI4fUK@u8*LXue)zZ;3&4;(kujo_=>_mIVy4yM|wj+cnf~MbE5LhZnP;DLvg=G(fa!)`@>^Qc! z?`vdG!s26%PA>%I`R22J>r5focIdVG#cTO6Y=xqFyw(?6mobGy)?(;aSM8l&b+P&^ zq7qD+P(mKMJP?0*=RD-Qe&WiIbULsfyl(TsGYYaX4x5OX8i1n)7daCrDoMK>jcdM{ z-X_W#>H-{}DE@OWjRe8lbC=FITmWOuEP{^;^gx=? 
z88X2s5>?miYrFFnfLn*n{h2hPLBO=Z(A@8I^e9)M#$x7a*s_?N*E2Bn%GTavt)~%GR@~=BTMU}cJeQOt z!~#=Ea$A!o5B--bj_2LtIgEHd7@k{$=P6)+cmLyj{~v!A=OW^~E}SES^9^v{E$#=y zeG0gK7T0y*dINkN44=>9a|67e#p~?WKBK%t9|mCVL;1t#>vKRko|nvbRsqf2D9%`j zx`bTlGj=a*4?=Wb+3dSXkD%FHx9RwQ`oOQI!y-Q!&w-LHg<>9ouE6BP@zbv=&%q<% zN2aF;0pRn}sE&qSDXBQN>u>MkZPEjQynkU=Ka;eteUUtGw@T9C3fZ|sMgSBVKOO#J ziGWjIoMeS!%z>f)0_%yCuh@K1Uc1hj6Rop5I_e`5v>?H zG43)o_%ljc%Seh7V&7+vk58?T<7fAfOZaTy8>!z0WN&?Jf5Ln*+fl@XBd+%uR{&OT zAN0*nex(qQZM`6h4SjkZNvk{BN&2;Sn*3FNiWJSbnJZ3vzQuzP{IvR{}SKK;@{BfET_~Eu;#1*x@HC!g-!_5uG>`b3$5M~ zIIER8vGXIC(sOWN|Gs;GGtCOoKh9z=bV?MCIG0b%=Lv$PzCi2n#v&+t`CbX(E;|ff z_iZ%2zXJ>uBa~=kD(HPdYE%045ctI_CS5hk0CUXTX8o_C!3;4r#dlX7EWD^|lbh#^ z>Q#tb3w(MHE}9w$wNDp9LUjF;vvkMNv6SC6PsAU??K8f9X?da8`DHgN8(lyrw@TjM z-d+eNS-nf+*mRIn$yFb=RJo$(=u?abIC;Z9PN4AW9JBvq~Hn9AtDPYKNLe@q3!AQyMh zD8@ioWAi{IZcYw7tZzT>Voiby@=C?)nGcAmajz07eU>0q@Zh@>O%~Yif!A1P6CFCm zE%AK|LIzTjld{OqKtx>0TOdYwCD0yqGwCuy=ca7lvBlz3!1jL z!7Tc*fy_ayzuRv9wU|W)``%Q!zG&vaeqSbbcIvHCqIPm7innIK^mj@#Igak2Ile05 zPd`5-q;*WUUtOhCT^##T(6k#RKXV9_?Z&tWRXNG!CedIglL*%XzhL-f)plGj%NRWU z_4bvxl?tqRCYim(zla+1dSy@G&K&AVD`aI{>?c)$Iul@xNWiDFmqww42W0*zV69tv z0lraLt8)K!5%Th$b-qJ62)#v)EPc8Vir%c=W>H@)4?btL&S+X)fWs{{&JSP5pd6ni zs&1AzfmpvC9pX>pvFqvVS)SKISI1_YyA)4@^-24k-p{Z(13}r4|8fu#Tgta9f7A|? 
zq}Hnx>IS1CLn-5c^CZf9v+4fL=UU*mlxbq$kQ!)xGSlThC=BpiaXjx9&tb&#!SLJ~ zJWm1VeB=CCoQsI_x^RvR&Nsk)x40h+_bK4|SzOnJ>kaUAFnm6X&kgW?7O%5z57J2y z9G_qv$4R3H`H_e!J9`sHnj0$1KQK~qDil4NZ2SFpUnnY*-OzD;(g7YeoDs{U`a+c9 zgl@s4A9#8&d1>#fOF-1h#cI6n66^{2_9*uiJBct8cWs28nRL`gO|Q+{ne5cAqPOMBHh2);)Fdu z{JdM$8%f^4(hx3%0RmYgnbqY2RzI4C22zhoGv(|HSIN+VaXFM(lM7JH|_=DfKbSp%+3Xsb!M;O`}ce zp&RRpPqfuA^-OZOijvnk@L9#{=FZM#QUceT-q7r3IB+8B>2T^JXh`QjrkZpeazq%E zecvAgsp4|c)5R4a_57JO5etS*4R}h3`34YJ1i7&@^ zclPVv)GX}${)*n)6OKeVY4y#0xCF=PmROGpok3N*T8$mvDk1bcy{lFms$lSW_i>%Q z`9L-6qT2c~?7p0>>c^W~I@s@3=am&8P%g78~K5x7=tr|s~9p&Hu zJmDw=nan$sPVr#(^(iEZ>l)?z@#e!v18rcIRJ)gQw17wE(E z@#mmZ35EH-d52W>TZ+?%LcQsw!iX#slajlm$tnTpIq#~!3}8g+k2oRv>_3PTYkudNFC0X9 z#N#4e6P?gTo8Ica&#~wS>zlPth?h_Y2Z=wv&(5M^f98Cpu7#mDghcmX1V0#{S+d1Q z&j(@FEx%;D7=iw{)ie6sD+o^BsErpc^F>!2fw=aHtdnvci0H=?c?$B< zh;993T!cIsnRfquXC&7S;<@5@-YuTPi06ahxixs60?zry`Lj3|5$AQ`92uN%fctK7 zKN#*)!1c4Zt_#;2;Ok)cd={S@;QcIKXLm%VoG5D>fnimPlcgPAVCHS;ql9)75S(eV zPe#NTuEbfB<>$lD?+-dBTjs+N7OnEKii=L@j&!;mJZK!Eea|iVe7G8FWF5HNCf|F^ zt^J#F%szN5jK#FVJA3R{{rMx2_5(yBp*1Kn`rt=mVRbbh{kbDZ;be`vn5_odv{udf z{&qR#Oj7y#*Y}-Bh1XP9g3a_3%T>DIdIXWndkP4fxt0wFKLyES~0+HkQ9R|LfjUbtXy`gv4 zaH6u!uv3(?$&~Ix->QWrPZNFVs>?mzXHnh;N|JkmKasY5Y}-rA6;IO)UfUMQg$%p( zx7=&qg%HL+?X3UviulQ~zl4@sqm*m9nWsQbxWH)5uZL7^->v&|z#btCIH%mL7C~Cf zi!WikF4F8>B^pPDO3hunjqwdgO*On*SnMBB$bJc*H1ks_-I-wQv|1ODpsn|;g@_V= zA8x1Vx~Ru}xM_L=$T6`*TL*(=4x%I|{F-w=wkG$R(Mo9+Wh6m?x3bv=2BM^pKqR-txQcoh26LJ#XvgVnK!*>(xag zRw!i73%iP!Fr86A&&ZCn0^(kx`eWiH5>oRb<@66rFLcdE8tvOB z`6N@<13C4=u#wrl9{TVvP&3#q(OmM2FQZDHNL_8VTv5S&c!G)R=2jJbbl=A-=k$u0=V=y|sc z4||}2`t*cwk`0LSzI5PzPab6Lyvn^JBM4FbQl(O+i}4+-dr#Nb1)x2Xoul<><> zf-iIXL$E!6D#oIt0?zPCogdm6g0yjI&sIbSqk@afJ>E^4sJwJdu5z;pD$8*#WA2V0 z@sL>EuBM3vqTs#x_2oZo$Z!n0PCBEBw)xF3*;q;wf12MqzQ^niDcEjBlV$ZR<(j+C z3*n+RqEuU*H%tFca3j8#m7p?3DWvlX|7^~MsJSG39?zBlq?RhWx#I^A#fv&>+#B^I zdz-u>Rs9cCJ7QDAk%koFs{TcX+;SVDzZ%l?c&?3da>?ZGSbdIPm6}`WHRTh~#BYG~m z)Cn;Y@KgS^1wrJL6m;5p4bZANJ)$CFg9a-cZ9DzvDsu1iqHZ$P3|wrx>&WM1ff~Id 
z-JTV^j}OTZ88*;GA!qKZ|n_ab6eBk-_-} zxbGJCgW)~}TtAEJx^TS#z7B@ZXYsiK-p}H7Hq_ZVrPhB0-pcuYRCU-4?3|gm6k!(y zAsvd&WL0s5uB3F*InfLm3vye~k%>VL&)qkhn-@TKKXZK58JUUp$a5N%GasZZUPNaP zZjqr>sOIc5KjlC%EP3#th5Rufn-i_&A`(E-q&A{eUJoPSO1}^4ASkv8bMF<-!upL{cfAGnjDaTP2$i&x)Qkt9*uP51(U*D!vl@&ib9l!R3IyWWB z-M*el>8%MMNxc~$<_+^K4X)iII@k~Hb*n6(P8JH*N9S_D0JUFyqg$0w^6QV={;==C z@&X=LD>4WmZ>y69Y58D%0_4ryn(lLhNuR&Cqr-#h=r&I~w5dAsWSE~ZvUBDwjf)5g zK{nK814^EPZ!C7#Q5uFf^|Nq~ z_?x4ct_wIEdPk4x4FKWWvBRFv)DX4jD@xpZtPrOe$|P%tWiR z9?w!U5A6s9-E|4K#Q4idwxZ`h?LEW}3qk#4MJy9a-gE9lo*x5he5y&bubhMsim)+S zKc;s!UYmcn(M9~#zw@2grBg`9btW%^;s9i+WcBH(>z)Yt%ki<@g-&S1(}Q3AiGC^|f&Y@$NkEZs0>qE6w zY+dE@4hg0yVN|nhWWn2y0=K1Hh;9Cx4u@t6A0BpgNBtr%jusv9LRBZj;%~Y8A=QN= z{L7)&;ka{$R$pTw^wd>5xO3PUiI9J0KbsMN>Tp?!pXv9(`25zSdrA)hZ>(`hdefuLgaWCx#%vc_=kGhHQTd!(s;)&5eOjii-NXin z8TKx1+RtU-xSRDY7tuYAupcQ zbPe$UK;{to6ns+;xvSbAKH6~>ywrHS{;fhADy$d{JI^^I0>-l3hc5ac;x%p_W6Ng9 zZU@TD3&$8>$~*DC)LjJV3pIOu?GE`bR~*l~#d8?(d@ww>2G3K#Io~*c7Uv@3ye^z0 zgYyk=-!1M3!+i?4eiqkt;d%pn9Son(;&TJMpT+BJ(7u6OMU4@7ra&OvGg=n%cC1gG zzjYD)75eD?AB@X4d70t%SkOiEL*2U_w*(W=OCe*zFFY&|=>yO*CFvw3^1%pK+|W@9 z^}TjvV$?-Sz*IBStCIH=wMf3f7urh{Ld8SZo-b@jlNd)#%6%*k#M1BYuq!iCCRV!N z`*1sT&PGD6mYD}pJ5p=u{!jx^jywA}ePn@Tyyq-y-ElRvsiKU*0^>d9ee9jN)`QJ= zbHlGmNJyiES1ylw8L;)MivtYxj)wsuTKK57>^^ic<<*Y&d;r$YG(|9lz9oh5@Lg7k z`biPBjS%}DzDlX};bGtQ`4#c`4MoR{%4lep5_~*;&sQp8VbnG1lp5-JGedCSx;~a` zo`3d))GwmP#iY#~#<{y!H}k=Ul|ns_hXf-sy9q0bR|8RkafFPYizu?YEN&j-&rxxV4L`4)>icOr1Eaer$Dlh zlcDKn!5?W^JxtdH0$gl*xTW1uLBBr>=&2BdY?Jzp;)Usa?*HcbUapMp@Y}!H&n69M zC(-IG{cTs~U1>L!%{p;=?wk$Sa3z?dmaj$Z{SKqMW4{ABG_BBEE0i=Xny@o2BCNZNv$5 zYXk%*PlO;6)oBppwbcw)-AWY7xz!8Sb^e4F zom$;H`$HWixYSsdq$;2nZaK{JwcnyBGmGZ(fG;G?f^s<#x+G$(`E*Ht(+>1i@8rC* zhbGv;UFn&5X*WVp5;^GMB8z@saoXSg#~rX0K1h1lfP>nbTLZ+-H*&o(G_ z!)|=9`D4msBAZMRXd&J#8gG-UETN?8>V^L{W$ znRC-2*MdyW1pn@$#h*r}|9xUqN_2$Z6XN%W`otiq3nxPLBO_1&M#k+P zY0Y93FLO;xF!8A9~k~@ zR(EfdCvyK>hc4&nB6aPP;!6Li7*MK69`Jt~iUig+NVtgypabV)&xZYSLn_w&&Q+64 
z;eIXqkXd6g(4csq`Ftr2xug6}Ugw=Jisy>sdAE2DBc2b2=honP3OMH*=g;C?M4Z=! zb7XM70q(oS{b0CH0oTvsx-MLAfUkq$^I3dufcLX_o&DTOBl)k6 z3+&q&faEv6a=OnFj7sztZ8J3^BaxZn!$qe9&sNeVK4Q0B}b zwiSAS5`O)2rSq3Z*O%Dpg*P%_J7cCezX7ImC$5$x-bT;`e~madERSh;g*EC*a07IM9T)y(^!UG2LUBnDMvegGkPZHjVR5T2`r~@n=bf?>4!J zaP&SMgB(18rd*)YRn(F}cfXI>c@E3Py`jqcXg!u4xxa9p<_9B%Jf73VdckbaJub)|C6tK@p7&Nm2;sZrNe9-6k5~2y(B=m92ctIo*$=V^7p=~w zFywvpB2@tIs_ivfIUNx}jlM%K{~bi=y}v{>R$#di2?ymaF^Z#vJdG4JayRAHryn8~ zq`hdTr$*-YpKj=&lYV)^yFk=9|6bLqr7yb9y zaw)ys5rot~X4do2N`>^(1E*$ne9`;4lAMR+FCtCyNyZXR!H~JayR$;n6J^jayu!1u z5ZHa#B@C7P;KGK@((dcV$nhWXs;}x@5X4&is<^#9Y7&x8CznA6bjtnx{`~&Hmh!$- zC^Hg~E4f+VZyJG|ILCK!PnQpBHnUrkK`apVH@1@uD-%FHqpU+4e;Keciuji`?1aQw z!Tb9j*JyCJ$wB6w{W8@)s!}#CTJYs>4{{+*55JHG`rJwV}e5zLq|Ba4{ zVEXS`g4&h>V#SNnf#C2@RN}1|=fIR=%8J>>t&NZS5v`&o-RukvsGufSwT^K+3m-le zb^_K&^*eU+GlHIwG!)wikDNs9{Py42Fx;S)9U=_smoS3qWtkiGvrR;m8^u?9qBo)wZodE7gM$KR9aTxq5^ttesaY{_&6-0KD3pq01VQkFJj1D~Wm;b0VLNxXo zvZnv>k`&gg1lA5lcVGVf80LIm=%Evq3k^0*A&fCAXfiY4sBfM<^FexE7l${y~ z1)H|!9{G0yepKw*!Z&di-NlVe{QeUF1svqsKAAYe;NY2ZQKQSKJI{4zdy!j5>IRUrJz?;QCoy*M;j1@O3bJK8w!{@O~Drv&D>sRYS%ja79TnC4+rGRATRp5cq5l8V$5k z6DO?U(?ynI?HFUo$(L5f#w88~owoP2=7~d3{!edh^VOlOU@^jD=nR?ZvW%TG5}}5& zU*7b)G{`J&v-Q@mbMWgE2jWGCdFt5ZCob+2Y)FzswR-36WH9(d{0=j5h1%_Jw5vW#5HSP~M49RXT1y59MQM(sL@2X@j1F_CnC-Hs+b~_%NV=Xa9RfE^t z{}lLxjj~_;cji|p_H*LrN0CmC{Ik+1zpYw z)ARzT_Bs_g&bZ4kuQ$RmK##xl##r zZu10kon-=DA2VE>_O6oJ7XCR0I}B2dA2*#4sE;Q_aR1~8&2xbM>Yuyr95RE8Yw7_Z zeppWFHgzQ+=LAzJNrQno*I>q;bEl4eLXekN$Bd=a9>Pb?$yRO}iI7viUExsM+xq2eE8bE{vK z!1s=ed1J300fq|edGZ~Z_- zj9Ygd+Z9;R+b_7|-wZX;gGf4CU;wB7mbgstyTiGm#)b<4?(q1fBfP>Izo~v}pgn&g zfmq%)QTj{yC+Tu>Ckr2+6)aUurnLIc6a7Zwu46aqz%i_0QBwzte7~z3A5<)la@eRn zZN>O#8ZCC_JDuu@?+lg3hL@jE1XX;cRmeZ7i+;*I8?F06^T2q-V%H8x;JH-I&G?dd zJh`WK_o5)O?3_~d4a-d(yl%+zIyDkRCk`DYw)w+1HxK=k_8uk5ES$bb{@VkJvRfHWGwLGCj#dy?&%EOucZbAdYy<%ucXoi zoMEQZU8U|WJ<7QL%wIcpmBfzbGm3S zdN`zM#&E$3)LL+N>59k0!Mhw0tqu`rVW6|~(>XuD%vCufwh{-oDJbkQuF*ob7}k1L 
zk-R|5^>cfJxx=87v!R49k2>Nd^KPGSGZg(%`P=`ktvoWE`Dv-;>Ivv0 zQhczm`8@F8_MXullZTIb6VJ>vTtwSNx{UhyE~8VQ?4&wt%>X~iLDuMFEKp}S+wvvX z8%ky*=qqyh|CcL{=iTBtjCej6o?C$-5g0lp4~&u8(u0p8Ezb#|!AWzpc&2<+f_JG%54fU6$&53PB6fG`(QL*acJ*i9E+ zBqQNRioL+9<5#*G*_8Rz2EjnCFK5F>N7p2Rf!+ps0@$3m&oLCfiyZR>pf*# zA+J*;_h&3;KbJ>3MSBUA6|tfi6pebKS;I%qS(7lK1CN`qr9-jfJw?NL{v=n!&a18=LFB@!3a#6!!wy^!#~97fHj|2Ff% zWb~aSnKIeY0Y0V3q&{J^c#BcztKg%ksVmbXj6X|S9@yj}LXEzcUKp>7w3>|2I5ODt z?@6C##pby!=aSS{4N=0$_4jqGHt59i@J`I90JC#8^|P=`3ru3VE-27GY=Y?x0Acn@ z>_O#UL^b6UwJi~1$cbzHUb`lY5W&Zm{7WOp5#M&lAX=RbF?4>^w66jblyjv8Ff~xa za#xL|_p`wkzKdriia(GZp6#6h-S;7TPfpf5zf7q1S$%lxttjZ&{keRCdKy;D$Ojn}k?cn(b4vEgdTAyu?w73euzIQgO7q`HaN&O9bp~swaK@hxY$vW&r7t%#P83g$d+0LB_C$Pa-Y(6H>0ty(Yx`v<1CJ>MRKYHig1zMF`xcfDCNVU+a-?XvUB zytDCWVxpwpX?q`VHh(#5yX9%{T!TFz^Y9X-(~ zChelM5KAz6p|fU0%p55cnLAV4tA>)>?%N(JvjS18^3`uf4nr6YzOYw>0|rA4wyAk% zM0D#>^53H=sFz9Cn=Ok*P+s#<%yTnWFtqY{5AQ}S2z?;QCoy*M;j1@O3bJK8w!{ z@O~DrvlX#A5}CFm(EQyysgI^M;If{Qc21iWsQzh|;X-dr`jDud74z@}$!t$8e=c)6 zN#*>+)^mMSQk;W(n_9H1lOunwshWSyk17Z0QQ>!nc5|m{Jb3E2Tog zoAnR%>TfZhMYAyy+m+W-Y#7%%*XaY`JFg6vi2`fYU2?$5!%Icupa&|+lDinT)eNqk z5!!h=PYX1el-g>mJD?rv?e)46M)2!bAM5yCw~1OU+trSb5D*i01 zW4eG+ZQs%SU@(z=hYjXa09*8fX!=<&vgqZ9>AFY+>I5}roPesyH7l)?52H=o-x4$6 zPUN(J70c6jCnVL*)s@!I+O5kzq}ADZxpBGb^c#>-?7Fpt!gXqrw?>dC+dQe}U#A9( z`xQ_vkTkY+AQ%vBMh}&7+ki}6&w0K|EzqAl`dpMV8Pr}-NJoJ>+Wh?6I~Lwr;G9ml zTp*GPsy^=HXDr2Z2|rU)te%tu(aTZK^_xzEg20*jaS0Q2K&1PvAT}SkZV{}Xe$EiR zQFMj)<17Q(lOu5{LiY~P7LM}zb?GUbtN$52@wph%z3)G>L&Og89(>gPE&n8X+(W-O zr!ojWyvf7yF*y;gzcb^PR=5bu=1v0m_#_&A4IM3ucSUcrZy}a`#dL&1ffrKLZ$YDR zm-tBD3^?^hTjS7)+o03KDB{|BKJZS^!n)|g>(!yX!!gZT3bdDf{_ehh4nHZ!imd|K% zZlc6X)@9A!fJ7{c=UQ7wKgIN?cAdg&GY}+jpqFdI4Ek@K=U;DD1a~)fNk5!thXjj9 z-V5vLRDXrx-kBds)L#Qb0yoN@liu>)QmPf<25!R2l@GH`!7KfwKMi4SK;HFzWFM;= zXff}US@E(3cE>unR9&1<2JNhSMdsLFOziQEsgI%H+3L9;d7W;^M#u@TKGy&kq4`2> zW+WK!OnX^e>GMGB2DewI5Bk7Uh3vOqkzGOG>TaEAq60!~{JLLq$QH(2F3n7Cj6g}q z^0ePSH}w5s<5{!#B;^*yjwhn5zhz1 
zb8GNC1)TGZ^Jj4`BF^i=IWjok0QcSEelXmpfa_;*T^Fu5z}La>`7Ay+!24Oe&dQv9 z`3DUdflHUA+%tT9ZUYbw@_szMb$$WsSC(wP+Dhjv5mq&EZEvsO(t z)JY?;2O`FM;2w782I-f7sL^fiECbh{lL`gbE)4{Z5I>wVd_MmnliF$4PnR;V6A*MG zj6Wmrje8T1%qXP`m3OO{Ct{#pvpE;Da?Uf ze?l?5YitMw_x>jLTDMa?cNu%$O6;Sg>`MN1nJU`6iWhs|8{FS<=-*Of)Vcrk^h}y#MSkg{~F|*FwFyAfk zDbJ(%!NB1{4CQ;59lYNbLDSDdg36~s&2RHWf^qWSmEaCy+4ly&E2HXYn%zO~u>E_{ zOuEMow7CI%&TT^LXJx~8Gtug-X-w?+4NT9#I$v_epfZo-RPZGF7Ux4iB#vCRsmcJ- z8TJnR!@(fP{)1evxijF~%XIOnZ4GoB)*%bBnxPUPssH?rt8jhHXs{(R9;&%ahuEYA zqIb1#CXyAeL#v>?(<|!3r0R34@gFU8QKyk-qiVtdP+ws5s^upWq`JNITZ9Hbl2CxA zn^zXY2YYyw?jOzo*Tz;OKKfLG>vuyWK5jpY#Qif%x8HRdjl3!nJ!E_eO0eE~_~zzO zI5q2}C@mZhX1|IT)LhtszMS2yDBJ9b{HUAXqMv4qG7nsji>E7s#x)wWz5y1l(w~;&NdiDY%U)5w5^VcLX zQP5irf{3!ig|FrEzEb%YmuwZ!(Sx2lybKAlQ^c}>?&Cat>TrDv{N7@{S zwg@|tarc-)@-|*nSEb@_z;+ifJMMG1$v_Y&9QvwV{Z9-q-pQ^R-uaT)xREwU{5p}8rP(H^JGE8aP5-;el93y;v&5?O8_1D#WgeL`6Q{A7ruu# z-XQ62zn;*1uz^}qvE!XP<36alMf^nHKQn0aZ8cXV&K-LGDNfxz?glRm8wGzyonVCY z&F3th_6XBL`YiGBK_DsLZI_8e!A9396?VQTg!7T#9q9%gpy17wS$sbn1_{K6^fLt^ z@^6Cz35QOApPy#=UI}|b>-XuiA;}jrz zm7gnRcZZiH>F*Op$Y_o1`)G+JK_oKgF>-Rl2{7|Nn#%XF{V!J>&%4EQ81Z~CJhukV zQ@}ajIDZ!BBI3L*oFjwt4RGHr?gztt3b=k2*LC4~1AH9}pU>iR1H7Nb>+J1?sRt_U zBT$LKYWn!7Kh~^f%P?rUtFAoa_%UKz)Ob1k;~xK(UDT?s!Lqk(In)9vcYVo+%uvy_ zaqDR(60Bsi(ozp|BMDykX&wJPo7hqGyBjWGldv(|R z9}=$#uh60U$Dr}+jD{L1Rpbu)tg{KnaUi)S|4RC;5meK#J7s@V2(dNx-NygU3ds1` z8O0@M5v7F~mBWr?6RTq@r&?!c8<{2eIaL&0sd0%36Qhz5BtnV541HfQh1{~|9QRr~ z#dle=*K)p_YI#eIX}NiXbgO}xNk1C%W!yR6T$#dwj>pV()DB}gWu+q)#x`m&Y_tFM$eo-qPP*4&dW^HjCZftK<>WoXZB? z&S1Lni&db{Ni2Eln0Lq-0W5k%ud}tF0xUe;zg8%>7-a@3+b#uF0A^{*m6xk0k>STL zk54H$VcET`X$Fx6fL;C>O?oj9+_}Gw{n`3_AaTC3FhNundEQX!Wbn`lnQ~}KTe)wF zrRA*Y+@n|t?mpv{TWeK@{^7V8IeoDR^kWwChhgPBUHgrJ=ow z@Bfy(qdS_nZD_4;rvxp&D@@*2M&B#SA{@1QlJ?>6l4@e~2zkxUh?VJnRagfb!DX*3 z3fA4f%q0ZAZlm~^H#oAr zWy9X`THTjOtOXTo3`x(fY)}bo%?jn{cWE@J>{;B)^||Tff`wVU@;Gv1@N2$Rz#3KI zeG5P2Z4QvPR^M5+af=|6em09uCu@OYcjc<~)do?VPn`Tzb4iD?S)K3N;m`sy`vO;? 
zhsZPy;eC41Y3U9i&6K8c4)jy1sst!Kd{q=eSDethC3F}GeK10|hR{dL8>mu+5L>`QY&k}J1!k;DVES1`q8S|E<+5S&w&a&6~ z<_Fd}@s|Ww!P+z1pIW=k1C};x(ZhD*AnMPwF894&z!H8`Hr2TW^u1pEJHxJp+U*Po z+o|V=Ik?Rgw~VQwUF==Ic~`ff*NvN*K0Nfs_KBMgl-=Hd3T?2j@swDg7^zAX%UkPU z8jk}OZ0~%goI1($IpNFz<-ALg6Y_DK+}3fbke1LxIm{O15c_ot2c|6V)iQib% zQqW_LUcM64bGcmsaP+LddyP*A6*-pgVO1-P$QDYEKj%Y{&)3e+S^ShoR~CyCC*E%Z zvOV=bG=#)ZW{uj$HIIJM#@_^pwMTD2E>1|?3aQ(K@-`|BW%AC@rEdw?e0aSAbslz5 z)I$}pk6*ohn#DAezZy0CUX2w+dUZW`pqJ@0W{%4R>+BXF6q#aztiMlh@kv#&1Q?4<_&Z)&%i9v%ij_Q(jr1QD3IHu1H(e zciXWnW?y$@YQ#eDm|mV=b553r`&*~)Y5tSg&h4Mljr{E;7gOm zJnmlF(G6vD3U6b9^=;bJ8kKb_x0)UdHPiPu9tw%;kU$HLLl(?6a0D zP1p9`HOkzQPTrQL#`nY_hE{Vw_Y408GT3XQAVA(nM(-Pi{f^kl2b6o}%xr>Qsobb3 zbcibNRf$&F{IFsSX`_BU0u6B(!MMpWkc{{D+ zkm?1Bi`&elLueKG{)4PZ8Nt)QIqB3V-@FdW&ugNZF~62+`2sY-ET&uF8qH+pJLNe= zX_AjsEN=rAh93VU{IeFQt7{lBg=}tIRQMs=tmo5M{b&8_E4!zfq!U7wUla(cM(lkb zG9UjP`HMY@+AC>Az8YtkV3<}zJLz7l;K!;B^4QW<8wJBDh-YG|dL8>mu+5L>`QY&k}J1!k;DVY{!Zt z=epbLnl%NgbZDNdAaMI=@%@MMAcFT(8Spns83sfp?l&PEn8jm zNM&=$5c64NTV}#kQiT>eA~uxKnZk{VmVM|6hIj#0@}p|D5Ki>g!_3D?oBvW4&PO=L z<})M5^ewjTdb&gr$h|qkT>6e8J9H>~TL&{{?vrgOIyXwWVRp{44YNW56wr->ZbuL` z=Y5|QH<+Qe+qqZ{${Yma+|Bn!VI3~dP>@_aNe1~=sQs#XN)-_ccGE76-H)b+fAA@J z!U4)8qpP}}3Zr3cS?)2b+z5NDLA6rOHYD`ONuFDSEU1z;d)@GhDf*x#IjTkJ7bQr3 zlZjdM0<9r^MowGi7i~HzbB;e<9mx>fe+BBIQoKtQe?q-J`Yk9|1a+k7(db=9PH$s1>=fVZ+ z;3-mK+0La{KRIOGcb4+IPszeDr;LJfzLAfoH1An|_YZBZSnMc{|1r{M)4}u7jz{)0 z@+Dcv3mI{|?==&%`1_^5@0%GG;l4nWPysXvV$;d zulndw#Y@smT7wVkQx1tflcq{7{x?+{CMYNGtyjt}k)>@@wbQ?{bAdirndNZiSO6OP z;o?^CeUjF<(W%bCHV%-wxMu7Q38`rM?|p&plxf_eCuGA%z1MUkr|poA^gUJX+cLkl zb7eK{oMa2xS`ke9+<5hoWBf&$SVSLF92o)E-@fda?(4v> zE_2$0llqp%y4%Y@M)J?y>TB!3LIllf2VW)7R32neWbyg|Q)5j{ zp_fiuSFru6={=-h~u5z5f|~bXT31 z>amMTPR?!TXxR}jwz=gD(>}L6DSxeL2Cg6zM|KP3)3>`JMeOk7Wg1thPEkoc$g2(oj{&>?z?aySyEMSI zQJF$_KQq*QIZr?`VJ)&QYWiJ9ffnLtAi*EGO9M5Qea11)#t)ufu#aGomqS^~YBu|t zdSeSShp%lLv__u1IC|n*n;CXqdaXCqpGM^lp2u}j0r~sVb*R?|*7W4^ZQr1f`NPk7saoO&WeCGwqfzRx*{H=ksQ-as(8zB#y73=c;UDoGN2e}pAIMd&> 
zU(MATeG%zE!Sej!_;+C{IwYU&-=fKly_K}bW_MPb3RhCpZ%ePL-XC7AN%%9;{n0udV441`%u-(m^~#$SN2MhIOLl6!PS6gTQ}dB*@vkyu zb(7?s4MBTo7oKyiC<&I(ZAUc(j%}`|$h?WtF@97)Kj+TndwhQwWntBF-&Cs?kQexN zU;S4#;I^8`Pe0ZOA~Rf8Z^hMtHzPX*c9>9sdc%k>>&k0#uc3a*s+b$Pu1UnJ@*5qn z2vl1Ji|`;RGOUduV>i)1mpL^%c0C4J{ueB&F3VvCr~JKTOsdgLF@9g>T8Q|+RjpQ) z(M;Q6#Y*Ywh)3mtn(*e*iztsC4U`|d21+G!1V_2#u&*aq^w*^eBaK$6<<*h*P>MrV z_7~$;v^{#vJp2>>#MPUKy=sg!=KPEmSKfx~;&+yCEZmIAnkanS+FuLYPZhM-)|aBJ zGyCNqhBu-@YZVO+B+$WftS=*O5YXGl@Mlq}e|FaMfRD@SesB7#t6{M_6!t>?zb|4T z2NHpU?4ow4`fZ@k(OHEZF{9fIzY>a1r;<&x<gXoG*p)5FSI zK!In<`}cxeG~?M=&H8zJa)KR;?4d2K^cu@$8Qb=Xf*Akv>z(@`W6I^d7Sn6rN?t2&= z?9sd=8IUXnEX8j{rCs?$F|TN-yY*Ft_Wr2ApD(}d$QMd{)8GB7BNMvfgx)Qo!$|0Z z5xO;mo&v%1P4LeWyodx}7r`S#@EZ_yw?sV{QKvxQX9-*vfj1!XU_^YDh#L_8EMaHc ztlh*7M(2Tk&<={k>uK0d6ZWt19MOxk?J}V z3EXEry1wMQ0#Xm{@Nl;(BC-4fO_%cl_8Xhlrg0iW9x8pqJ6_(%*&~HDQ}#CCQr8&; zV-`ht9Y1|OF=vjjNF5wA-K_y)>$2C`4N|d9nN5St@|O{}ikzgDbnjdYu0IXTzRDy+T~yG{c^ud4LrDd@j}(r;dWPyeo$4yci__y> zx5#UTC7jrY^FgCw(mXg%DY)cGJ^}gj0P=Vq=XF9!85Ti7T89ymukp0rb6DS8o;cs0 zd<-F#DHPwSu)@}xW!=H??7=hK44kFxjk%LJ{zqcimywsAm*IJV=Sws{p6zS;a)9o& zO(meoOa`3(^pWFeP%7B@I3{Iq@g~qeOPk$vWh=#_`SI}|aWN_mf0#{N^n~eINqb*g z?AeTxGNsbW_25#XgSHegLD8#R7ggz`muF0WCqTWS#JfY%_n*^V(k%si z<19eQ#OjlH4huBjF)~POToHsw=&cGgY0{`w4~v5Lh>?5D1Qrih`qNH4l^1q$7@!;R zKah2ac~4Q~uTSpP>80D=9-BNWUrAX>SNuB_8Uqf$@%Hb0dLM|+FLtoEw*Xngl-VoS z8^Ma7T+>+6WdN!kSh}Ztr(M&DSsuH54o!^PswX1W4(z)Qb_?-vA(rwAsh+QHqd^m~ zG{+;)AYYANf}hnmt$l=N{n;aTQC}U^p$Ef<5DQ3nTzzp5*2Ztv`sU1{^?W?k zcTdl(ds|)x?6o3WlUYoW9qQ+b>)z{No{oteyH{ywVc>~dc^HkcN?WM@LTP3hh(k!n$D4}i1xW2qQl8bvSS zQTnXHR%BZ1?WiH*0G2#UKS+EWqeI$hTUL(!2sutHb+jZO2qo#A}3pi?Zf$=<}YYCAXPamS1C!4!8eb6W?M4eQw~VZY0ZajL*=)>TB(v4r4>co zy1Hq2n%o{N78ANx9FZ88O%Pc)3kY3tLhqK)VT8y1PallXt@)>?K=6DM{IdivBEi>1 z@W>GS21MO0Q4dDcDG>Ns0@p?04TwA#5uYXE282IL*xBTdisFwsWWZd3Ghf` z&coJx5On`C&XwT~f%oTW(ty1`_NjC_>78t zV4z;MtqFd;I^4hdV_`*|x_K`hKu-TOiRyj?-%qMh*@sx@MVWoogE+6AARgI7Z;bRn zHB|wZNl}c)R8mSkL{L;-&Ft(F^tHh}tkOehlK7=68oH9|BO0 
zr8_?@6SlKH%ey}wf!_}$dw5{=yjy9PTpGIM}ZD6~A)^xU7EK<0Dx@|mEU9gY|d z-k=G`p?28iLmT<)*pW>xJ;nggj;VBg&&0<^y)6uJr7V|nN(OEK6?UN=kNZk#{-7X# z!(0*BONk@kC+BbanU$btD?KFC`pAs=LgNJFZEJ37F(BdVQJ{74CAlZl{rCQ`Ai%0F zP5z-CjN09=J}Ys|3XqsOq_%GyB$Ep6hfRcjfb)0u&$g)~pW<_lRPU4qc9XY;_dcMY zFN6&T^&yTqVDs;h>Bvo};*Q53st0ZYr>pgDCv4ilvB{m&E{$D)Q*mqcie5W-yYzS8 zQKSyozE~XHA|r(eyzuklPQ8wr=Q>xOEPM{&e7|WsJ0nr(^N<0bd+6g}$E*{|{cu02 zGW>=|<=p(XcJ*m`&6VgUfL@&?su^iKic{(b2B?x64WQ zVZXs!BJEVjul_^*;WJn4LLJ-FH>%BO$klhXMJk;riKqVbhjV>^BvUxCJm-yY#))i~ zSQ7y6Z||*@rCL}*y=m^W&3%yS(xB&&d>efnRTaRj-i&VMUh>`@)(Y4({^h~yv;W0s ze|G+h8@!{;GW=Qch7Sxo`*d2WXGF0E6diI659??FjemyU9;(}phJ46YKC{~b*5kb3 z(95_CHZO%fPqN$s$BpfD`Q(V?E*$V;YODianI64=W~zk3bs(8|AHZsUNIMiYKLMqN zE}~{D){Olfh$~iM&L8dE+yhb;IRbeesUV)=v9$87Cy{-BI*LPIM}foZd=Fo{zZHFI z!M-Wxy#(yHF>F#A)}IHbncY5k8hKB?wDW^}83=b0)!x$fn{sj?>9le2Zv6Pok_;Uc z67@oq4kq`7^FkD*6TCQHc}@q~iSa$VLci$pqP(3vD67p+^6#ljSC|Kk5TS?x(z!5K zM5DuP$MuJ?VEVyqWA1f6l+YC?^lstt2^~g4AB>?}L+B|GJl_QWEWwLN@OAxmr8Hs%8t2c=W*=bKN74g1W9-## zv9(P7RJ7raYMslEAVzzGA}Us5#a2D|1Lu7<@-w@m1@(mPoT@vLj--^12x)J*gtlmy z*$3Zp0B1K|pT??_CR8be?^agcOISS_Oekf^UCF!PqAl@#g)YlMi$CUTvJOX*7{emK5 zG0P#AH@6kT(P7b4IonznaOi1-Gqv(GxIT6X*Pq76K~iIaIy=xl2}Z`jeo#ThRs%Vf z!_YTDVQmA>yAM)r*>FB+FphHKJf(O)p|tv7M*F+t7I=Sq_Df;-cYRLzRIH*ogHfNk zoEpif8*1Eig@I?6+FZ}TS-v$|mxJSf%3su<2;XQ39MhGmhMK6r((Q(?dU`2f7Nxx0 z64C&gOD6MVYz)wp>Fb>-zK{*Kp0=U#k2y#xG%Y)N8ed0hzgmz0Mqzuk{osulAjZuR zw?^;=V_q_`M=;vuEV55`4VW?ARWV(Wazwv*JpyimmEw0O7d{N z(Ka6D3b6g}hm~Giv1FmkN{7RZfrqFQkE~lDDs03N%ks$s3=}@x>||?-+VecUxl;K6 zsQ3431X*{17@_0ffLkv}a~G8vW_k+Dx!)!ZO*DX-L$RHI7Ob$;+bG9A0Io8Ijw&l zscdoz6aNzR=vPSrI&VJJy6<`gYUsY<(^yk2-p{fOf3G2VOJon7!v4M-5cVE^fPRy{ zqeu#ELmOSTPDC+mf$<9kUk-v0YmZC_FqsJhuRq zw14;~`*jU!BrJ$^mDK^|@_%>(mW6+LFkks%Mtt@_#J{*f%J+Z%tghZaJNx)r_~nP@ z9e{Lo@6fmMgXnVm^;_AYK1hhmj-+=J->pz9d8 z_IAMMAhdgH(}J~Y=O+8CwV(3A z-0Ac~xr;KWp;Z5+`zgziHx>TlX1(zHhMGU(g59n0ax+eD0mrAa&DY0RqOC*4rFJdG zV98uNl2)vQDHXJkwqEFMyZdusu=IB(P z7m%--<%5ArTFXDYRp$0D)sDRxE8ap=qeO0W>g1egi`Ir35GEaP}>ehj~rV>vJi 
z3E8mlLGD)w+I$Li`1|P;x+D3{u|FwJaGcNq<%a$Q{_#*oMs)03e!=(SO)85$GTo7= zx5lrx#d*5e`B-x8}idG#hE<8DlcHW8b0*g@9tcYjEj~VyQFj2=Rop(m^!Wh@6?x?O zqgM{wo1PkTFhmNemW)uB2s{f4D;sBu*<#Vv%^sGgI{ol<{Mh1k1fS2Z34LcQyK(UN zTb4^;M?lc9ab_u67W$ee`yk}k{Wx<<*VhR0lDDYpzO;h&M{w)L=f|+0RW=P9N?g%? z`RD6i=SHGid0dbC#>ayE@hv;7Tl~$s+4i8&S_h!&|@Jnq$mzA9PR^Uu?hat6<%(^XR#l*VhWyTtjdE zI`Zl3bOGM4z;l)HJf~Z)p2ZqM)5bsa-9m4MZ(=1&+(SQ`#)CAiVxc{g+0pmv( z2j7Z@!}a-*SH_rH&HCu4vm(T$Bq2kg4Ox^+POI@!w5$L?T#23avB=Qgik z7zJ6qXr^DY2Gm(Q4t*Fjp^##0lp9+%BVE^?gSI`yweHjYWJXMD6NS`GVP9tlS7Bv?B9tIkFFY7ABoC2H={3I>< z%wRjMOQE}<-lAG*v}5=s*#3BD_ft3IwP`?bb@&1FPY+@!ot}d7>xGP(g{gz1J15zd z-Edt1s7w5e8UBAra!PZMFNZ|+Q|E9)Na8K$alO9mtq}|zDP2`JVO-zjVl8Hi^X|j( zH{G`m#rv5#ugC2)wfVA>72GyE>9e20EW8DAz7dp_gG&O|!J{=!!+3or5J)rJ59@}& zsQDQj&yMP$)Nm<=ckObAz^ftG4S7>-|UcXd}I?cupUBhO+Efv70`0y|+xd|t@u&K;QXihCfN zLo#S(y%iluGNF+)+CI%F93j6wy>so4fLWbu>V=5_A9^!B_H5$Tl%&KK>qEkTh#+$JfUS!lXe-8r<`%;4kT*_ zvVRlZ#tDY@f3G{m&5KS5#B%aJA_42su+oC#FDP+yd%O8zeQ@vh6a)Q?22`MfJBFF{ zHp-iAz%2ha17+JsKSEnu59_rqa_E)tW6Z@agYpFhK=plGv!-hsN}@g<{0ect4S&?i z**fdM&rLmh*T&=ieDi}B+l{ebq}T4;GdH15aAffzgM)~iO5s|Ad|hN0%&j zB26)pyazUFo(rxlZNS&5X2Z%njl`W9zcI}aah^9kDG{6s40WWqsYjZCcm4A3t8#Zy z(tHP>`z*);u@AF!>~50h3YS>L_I-ovT{e02k0^d0 z@m-47K={V9QXCAf(MIM(WYH(vz;#;xN!5WbWKy$lh{4^hSRm$hU`5^?>Z#-`1_;aH z{RACqQ0y|>aF1>ZZg$%5OUYm}4tw>o(rGap6oI{dCnp@<^5|peE@HYI6y} zd_~UMal=g*>BxB6@BZ&JYwd8-#l zC0TUm_Be`zs@v_{ia?4PsL3f761QwxBqk)34Jg^x8{F(3jgtZ6a2FTFCxL$MexWF{02nbEm03f z)F}}7SpwHZ;0=g87!jX^^JT;h;Pe^(EMaF2IYYOGJ%V<(U_{?87rrNvs3FXizSzRR zjQGf|w{X4Y?8?Jq@$2>R*IG@4_O`WK&z)g+hhA?#&bW>)Z!|F3En!d|l3{mwiuvpi z>HeCPr{>U3h?OL2ZH@)hR^RS(Cu1N8%jJ z(~*>{t_XM_X#wMzeQ%GT2fwG>k8@#0tK6d)`qOY6D*irrT$g&Mp2Gz{k6flI%&U<| z!q*brkh25)IPX5XRK=+W`Jf?x%plHF3cRjkIDZ~IZ?JQ?i9774CRYmi4d5`Nlv&T> z$2qv>8E&UDd`CYABKGXl?R_H3xP5D{LEhkY>vEZ;BK)0n?@_He@+?%6<;qKSTKA3? 
zgE-w&^y3+j;(`_O$&)r9=dV=IbOeKo^@dsDx=@`~N3vl5W_F_=;+S5Ss2(3P*ASXSTkgVK=pnY(uZ z)`5^tUxm3#$nP7?CR9^TqmsoG#K|Z3jZ+<)PlKNMDGPKLQ47EVa6;AinJ~cX6@_ z99MJH3olP3J1sJNE!SN*UY?!GK{nWuw8iOd{3Rg6p*$|p_DUeF9pjnF4GSB zV~JfpVsUHhh#->gn&+q!-3H2Xmy*xFt3+WsVe`E#515|~&b`r~29N)Aw3z!G;%9br z#?HD0D32S{HV!9X6P@UTHZ?Mjs^x^vnc{xMu;I+B5&Oms52lO{6c-Ubl zHkaXb3Y4Q482DL=U$F^}>!K8IXW$LcCzgyn7`lbZh|i*Fu3UKB0Dbf5Q5){hF8*b2 zol5;q8y2dKKeXQzJ)h!kn$PnI`g=%#J$;3yPV^yJ!Iw;M{3PSR`L)Om zV_IiTZ!Nm<8vBEwI}Du^fvRCmWd7~N6Y{H9(F4j(=NAKLw49$We>b@-(5_WpU2j*V zf=caU8`iyAL_gN~daNwuGMzLV9~X7`8(EEAqVc7TEG8b#I+GG~5uHe_8g;obLlJAd zTu>x83a`76RLb)`@OqM1k$g5R&~2h5KCuDq8#SnkXy1WKyoUc42{yp-H#6?ryFAFl zF9EvP#>lLqm)o_Z_^{fJx!bE(oY9f*W*!Chx9FK76_3}4w9zeu{>pD}_Ji}4YbS?6 z-e}VGOEvfK=gU7`aYFCb1`r9vOn)fT+7A>cNOQ z1p+@y;JOIB0g(qI;KT9GdBZEhfzm1-Y)UY~0auD9bCFF+)S=6!ZQ#D?fRJ45{FAo9?Q;4IY2uHb&0lexca#h*}^P*SqMp^@$U3aLQD-^>GK1 z=FZhD<9OVIlpH>K3crqW-2t1iRed=j3a>U=cy8fa+j=GNn_i@6XsHd`IYd}!;m?he z`(3B}F=($ZFg-Fr-Yt3mw2Zufa&%7|47YLw@4N55J?iL-eo05%`8F%y_h)mbk05MM zs!vn4hFj{CnSv35j7u=|r>{T%{1MkhHI`KFEQNY~F;68KI#RH`Y=VO(JfB(?Z>uNl zH_ClM13phkWe&TRTrsUmX`DX~og{bTyiQht{ybd1S^5oVRSf2^_4?Kw5 z<9dg(g^d<;@ZnLkVm`Bw-#QU>Q#}^Z)18dkbH3T4{%@w$Q4v!|2W@&1tJCN%*=6#7{6IM#r4Lr;#tCA9gy_oZcNk z&YRVdYA6Y$g*py7vsMjIwmR52oU$9DzxXoj(K6cv?`!_5b(<%lT_t5+e)Wi)v3K|3 zr7mx@`r|o>kM#or9trD)1R!q;?HA|mPYpEE2B)mI;5L~g^jy>7da9le%n z5eaHK&s65w764h_`%~`zve?!MH97GXOJH^R$H9~lcz+cvFG=>2v5F||#OmmTVfy4NOCe;7K>C6n!G9b#N`Gw>2uI4?UVPx+oNatckj3d^S~K&T{czZ zI{$)5mVp^Y^;^gu6OUSn~S;n9;Sl$~s(`NM8;v?R)`~om6@YD5*>s@d`JtiF1 zgV8VTTEOcR=*&%Dk|2JTlHS}I3~^nQJjK9Ch&Q0z|D4zb^I#~&qvHpR&!VwhUm0-& zW}SBK;+Y-D$nqXz+|Kfq_|+eo&cMh0d|fU-W^Xt4_@#CtePP7P|7>YFzMuM~Ngi{0 zer4s)tSL&RySN)atikt_UaFd5@s&*3f9!nGo)4Q3y*Vd@Wc}X%Bz1}%nbX`=lu`7W z(#OOlcr4WoZ1Qk?1=xEbpVM0RwdbLIQ)}CkVSf5@Xr#why-63!6QTP8xVE3L_HW$ zr$FFm30xO}Hz4w0M11!D#0?n!EMaFiMx|P=GknLmF56;5u+%a`)_xsllqPrY^!JoO z#{Ned5tyxlb<5MSGpITrPut;-?Tq~!I2|wNHUncxG~VAAl29aN?}UD{75e&plA=%aKYWGrh&$u>(xR?-KaxfM 
zSwy@kaMLaC06=pHXXsBW%G5J-QA6b88G3!NS;6HC-II*>0Q)W&Z^8T1#kXQ3 zeW!4qQuN6g4xB#^{Z__j0(qUlro$>Y-v~&wn!xMeQBG~e46)(eAc1e&`L|!)!Sl0! zwhtL+fGX9h!+UDZgCdgFrtQx}(R6jTYuiqik~LqlYHtn$+U{%A9yZquNwcYd^HpxIU$E)t=P^ zJ{#w*)QYL2&sC&%f4n9D@bQ}!4CD86bsnaO6>rREnrun|=ZCVRzUxP#uT$cJ#II^X zoF_%3FLV=JFWwq)Up6@JBkIE6IIz=$(e1Bt?0^(`R>ttVGwSe^ueHwR2%x_=cB>cV zMD3MiqmA5KQAM_ig-xG2(cR&*xyx?NsGJHHx8L^?w3$y`X!F2BRFdQC&!9Cnh~w6w z^|P;Pfns5KRbp}%KCjWLPG;ooTZ_=kOLQ=F=kEPm-4%@a3STorqPp*P+Vs`|b0u76RA0%fK95Cw50*81iak!%JV*>U+w;D z)suog;Q7rXz)c6kxz|(0->it)17kt+4i?^*`N$OVJgU zk6E1iv(YijJ-HTNYk;ea=d#6a{7mu? zhedi$`pOrs4%d-1Xw|QQZ%Ix=n%MKr%u%@^9Ektje{lo0H}9|SI)viCkKMf}iRpB& zc53llfY&Rdh2E)+@26%t>tae}oa^7*XV_g*o`N)nA8+opCMGK$^Uf;fpWSU1JBqCt zEYiv~`#_mh^i}El;07cG)rU3+N+Mm(OCfw3yh!NToC`UnA{diZ?<_z2DCM)50sGIJ znJ|9wxmv;05YGQV(ta@>_}(!s{UaN}9EWObQXg(X?E|lOu)b@6<05%0l%HXYSD@D% zA4i6zgf23l6fX2ceu;81O|Mhq(RYeHsUlqBNCXJWxXW&pz74q|edK{l8Vhp1-BCg> zN)0nGn_7Jy%8tqRzBoMcVLPhDLTS06Q$de;vGlE#myz$3pE}L3BMe<}xC{)vTj)kH zbQlSJFhaM6&{H6Iz6t(Wf)|nC>mqn$2z~>i?v|(rBkB|g{4AUw1J?z&Bph#mZ}B`B z91kNt`~Twx|NL3}cihe<+iI>XDKYHs+|(i?|3m7TbjPA&FL94?Gwd$)aSR@Jg$Ae8 z0-KJyv+k1lKf60qZHWwY$p1|edZN7xp)juBey=hD`hP;l%>&)zVZVqcu02WE zhx2!io>A(dzdN|2ZRdv|{Q5j_5Q>B2BN_2(;q6FN*LB(0%)W4kUC-Uni~01dwVLY~ z{|})zhwyf!6B95#06%J{?J)7M1_Q6QQ|Jd9lg}q+_ku6S-*~vFgrpXCQy9;8P;Yi^PeFFJDLOZ6p8z$hNh8!4J2Zhwb)>!b41OM}qdT5|Bazla z9wz)WrXn%;ceNC$Xe6(^1=s6C_j|PBI#S^6O;23k1g!hYg!As>{a=>6XT;Cu%y}5` z5B|5?asQjt^KNGf3_pRy-Kc;aZErq#Bl`*M zWpK!J;Va04F{hTCaJUV&E4UW%oC}lCt(8!gH2|=i&;$QxjQ$x~8<3I1l8;oAlJWi2 zXKBV*y!`0Z`w(YSlt( zzpqS_=X)UDe~jgZ9Amt5957x-YVI4^eE^y1(L7|%sSL*N{n_=z(g;22k+d;l9v_ES ziywU}Y}e$PTEU5t?wvoIv|9lBN8)C&taCIg3w@CS2|Qjxb$Thq_@9t8;!wu%sXtC2 zLTd+vUzBSi@3dUh?#J8$M>DhxU&i47S87R~z7o=3Z6Wb`7Qpy%d0nvn8N8ob(1`#f zKASq2G}VTWw^H_-6XsF$9;;8i2Mm?|2&*0_KobL>-TD@;j655>_KN-S35?$J-I(&j z8leh>OUMm917EW5klJ&+J0qv zp|vG9nmk`-qSW{A0xq=T*Yy%*uK~<6lz-1-=XHg7gb3la7aS0^^*%Vi0eY^FQFn`; zm|)a{q0G&UItBE11p_~e8hvBny3p_|47>qqd@1x1o(GHS<=Aa{-U9oS|1WM3Vcx&7 
z5BjqZ55mbS=ZZD9`1q)AqM=v4KQ_6I{lfh~Av<104BRI@B2~1ZR+pb`ZwXPr=SAY) zr;gZt&ajLUzm2}iWQ%kTJb=lv1|khcc35ib(Y=<9X=u=+*5Wm%cL0sCx-lkI5oljU zq&|nh=jp=5*pThs7}J`7{xq3-bhck`Bsho_9HENt&rXrT*PV21n(^Fd-tbrllhM=r zsrRiNH8m9~t91KCzrem~BGR)TSfBAO;(x}5BqStw%=qeI)43~o`)TeNM~hy+Yd{uS zlsNb7riCEbeOz6UGfxJHrG8#E)n&oQ-}+He5U#V@IZYK^xHYmW4mgWFJbiqFj&n0w zZ|8pW*Sg}1*xj)1#VTro zU!St}!9=)jq?4jTQHWFR$!%iu2siMLyh;q$_e_iaN4Zs)u!kEPjTCHuBK)C)~Ur|OSP zTHH1Qg4r{sBaq+7J+gc_a#;tzj->onM-1PlO6|w{3;l)9gO7=^2d$6$fLoWkR${0o zsMIS>&F`6RVDp$ZLw_1?|GR|`?`N<4$_?LUD2y=t+=8W0Jw&cA1LxgGwLa~``Je%X ze>=`o3S4=}IDZ~0)gOWLI)T9KvDmN#WmMB-J`4CqqB$HFu-EU3PVQGNl0NlymqJ!)tIrM1x>F3%=!SI$;cqNJxvA71<`gx! z9@H}?5fpqK)W%#$m|MX?mXBkXP|w@N37evGfUV~*6VIOzpMmBdFUZ}Ck?Ql6UB?eW ze`k@s+ky%E{@D6AkEJ>K`mT!Pp1csCQGV;%y9qCpt1a^L=xY;jB6Cj}+p}9>r^2y~ z8njx#QS<3DU)D|V)o~^7o<;)5zOl*XhCwZ`P)TXE9yZ5(VQVbS6=6P3_qtJX`LN5^uxu2yuStHwVZOW7jVOni@Z#JK_@Y{ zMd8v^-fDEu=UBOOG#Dp za9H?(-hd?TuYFbD%?6(nxejOZSK)h%P*m^IqB`C z;CNz~ASiajxFYS8y<2UHAQX-pC0Ek z)I`{SWVjC1w$aU7`J8ma;=!?dcV+pbVcb`3we!YqJYRD)vX;b*%pC7^QV=1dpZzJi z)wG*H{lgBo#;1qz^EOYq;OhwA_p8oeZ&Z`-T<>r~U-e(y(@+e++qF-&oHW!0{u}xX z6MD4~J@;k$Id%u+uYMo<-sz2)5@sI1EufqJv?ueZSavMfG&B96jM`1N?DdwF3*$iH zVMRpj597ywpZtLFz8m<2&m2?vR#CTyMGpNlZF{xiQ8kzklhHQZyB+*bSNuP{TSAAC z&<7)QYY06Bg6I3!fBdrqFCxL$MexWF{02nbEm03f)F}}7SpwJffA9v3JQxw5{l9So z+@B@vtis~zm*VbMjO%Ex8IJuuyjyDXr4ZB<4XaPAdkSXZbp%EMn9jJIaQroYxGqA` zi3Ub+U4-^%W%wXT2Ut_aRkol8Miu4eh&KprKESa~^565Ynq3)Rzvi=>!Vu97*?qgs zgMdX;^;eVPD4^zWWp&%#czm5nV+SrUt|LdUHzL3Ng>CM;L~w>}dp-B!DIia&%yCb; zg14itVL6JolR&~5I4#NZgI>r||MP)?TP8p`_mrh0b11rrxu42YY=K5wn*LHQRAdE>_&=Fhwa5H%~ z)||HY{&d|0d3XCKC-avfAm1#>Dm&O8UHCd6-CCfB&#T$|hY-Gw85~omu{2v*Q{9~f zpzq=LA#3||bc^f88>b3j9HY75SeSwuoNs>lr#VfGM2(mK&S`?3iO=47bX^!2yzY7O z=1VHtQo_O8dEFZDa`t8ZnxX)w$s&vL2anJNtyqV1d!C`Xe!E)*cC@3l`p#{R_iNCg zi-$tvtm@H>qrt0R*SH|g`VCWg=VBzt`zlJG|8ONNEpX!Au%d2K7F-w24d*&w9!-ryWV1Go zyB<^Ylyt@9+HY?Ke%Bgnu{YBpg-8R?#9z=jQg%2qTt_q+k=^K>}$Ad^!=L@2Hqgl z>3A>>ss5jU{6SgK^ax0CKkIb#K`Gp|_W|JmK4E-wtXvoQCM1ztM# 
zb4^eqoCkEMdeX5SYyNy%BP4{rKQ}%yKOYZ16bBU=|4~Dz`ZMgI9BN4AK&(v11s&}6 z#`HVi^w~kC9Q^_-dkK86*i4fuu!Q57lI;Es{dpm0r&lX>*g)%?)Sa)Apl4v&aYsFT z|G+G{L6#kzvJ=Q{}VT0__O$a+|IUo*my?08DOlV z+BX}7VQ0gGu(<@}&9lQ@APQdB$ZSnaQzZGS7OOSr6`Z5`o^XThvBR9*xLs=8Wix}n z|2s`}8=l8}7y7Q;v$sJHi9LM$bJ&4#T+VddPLrsYm_uPZxMDmg7IL^Pd{`(1{q3%o zLi$2Q@%>jfCNX|5=*Hv3&@&m!MCP<2Ka?05qt25j4h(z|MWe)QCO1|GGD+Ob4|Jl~z7-Unx&WVunv!Gr~aN)_cE2^?+dr=40SK`Mt8cf35k*cl# z-Otzl#cN5DwNzwE$Jd)u=!M2rD&qRnVBx(eu8Rsdoo#WwKK#7)(#tOR_|&c^I^)MF zsG4BJVW`XN{PFJ`ibundf(^YERwayh>D;F$3XJo4d--7Va6R)!sTk*ZMTJ|^>o5u1 z_LtJEPJr5`v~oQ3FN_;d&-vWhj}1e}pP5a+_InTq8MUrWfWl{Ifl$P*3!kq}(7wnBQ-t$)u=K@^0UINQkhRGRKBUv>Xq(NY zrq|m4kE$yVq^f!UCsDFwiAX9fN~A0)+|wO9l{IUY>=MboBMORJLVH|O@A_x=9-Gw$4(IWu$4dG0gM^7)W)MHn`*z?z)b$#6IF{(Rq* z6zGf>I-f!36Cc=m&|zc?cOC?VU!jWRI;xe8Un>!e$%CB>q|9(Er6a=P zx)gCiEc)q7YeRCpvadsb?dHA7VZ>*0y)cFFStdU9^Qb8?jIerN@Lz=<%&Gi7ZA-Gf zA1^S*wVh4h2JkeZ>#TXMmBfCJvElSE!Z%Cvmiq7ePL?|s;kriW**TwCVy-idf3ax7Sfu^(UL^T*fgS2HXwaSvNsTm?{ntS?XsN5vN} z&coz&Wme{d;4fq^)ytv0k(7lI7Pwt8D3g1BU^6m=U@rFxGqr+H`!2tj2}_g*)+55d z8kG(ApY2c!$H#-&xQJj(jg8j@@;|Y0WWdRnYLqo9R4f>^a%#nUD3_GpLJQziAe*0@T;881J>$Bwi8_MRGQjcYi zSaCl9hc$E!1QeXv>w1uX1iw}aQ-i-$gBBWpX0?GQ(O1k2nY|rFte#DKYW^z`tb!Me z7M87os}BBTy%4sBn9Gg=yIl1CRvJbhF-Ri3FJ!DNn79gX2)txz#Wk>p+&wx_m$d;LD(DOcjNzgiFfmsk8BzFUfi zk>UrVcxxy=1q$bz!k?vZ5h=Vb3P*;*H=y!vseCXhPl1Y`Me}CIby4vKlphS$v+c8# z-GI_(Q9a6}w8K>$ghJHBd!X_^`t0|4=!^6Xc(%sD1-EU?2&ebkrw#lR%>T&4!FC`@vPe zV?Qt1c#-Wejq3sAK4ZT<(}}x^CVu?CuL7_6e8(4@VgO@#lR7=wP_o_YD`o88w{S6j zGQNn>q|a?lEPrL+;ocbno=2U!r+i)!N{t7b7d+Jhx1;!C|7dFha@+zJdx9)e{Md2+ zn3(iQqHoH4z8~_x-Y^NfUZw2>!tQV6(U1s2xc~E9+P3Xy_tO|$#-5i}hY8u=G=b-@ zy*p1vnph$pNAmT720b^DM+$CPDoXO3K!$OD6~f&IBf2|CJZPx6{2F2VEg6_qHznj4 zTmm$Hy?1dQbzq&`zFba$8z5k*lW;6{3JPmo`#NbNMpQiB5Li~53}yuu|2gD^^bHI< zwMkqX*NewZ_qLV++YLX`KqcE>fn_NBURA!qV*Y;7*6d7E;;S-uu(6+6)% zHx>fMik&4td+Z3Ttp7yPA@rUx1k0Az&~Z_<-%~1Ef}w(i6dxQ&g)f!#g1OAnLG{G7 zwK#ShuB@F+mhQX?-!qr?+ZETsKTl)=f*eX=-sw2Im4#RQ!GZj0sqU|1Z zsFpcr@9af)nBSh=< 
z`^GxMx4<2H>$_vy8)5vdyE+anNWVtMS?_~LzpUyF{(RLJwLcqp;}_yhDhabZQIQ7p zwn$uyuc`!w(f7IDzPkXw9$yhwI}iEq18FA_{w%ApA&SIBWIazlvm4=c0Ry-w0pZAi zvu9&!5WWFSpNJ*%ZXr+N0Wu#9F7e+eNl%;fmoh8zN5JVI{jy61x0{A=h&8qnCVp=vaeMBYsN*JrBDM9u}6y1u`#} zj}4fD;@iIGCIhly1Gfn+bJl^}M=Zkb7}^&MODH)&R7&6aRkhU*%#JA7%th3|5hdyB zk2wMGZrL0jr9B zCFiaal$v`qw4#T2gjyZ>e2n_U!Pe)OP=n;irr60yFSuiW!U#8 z=Zq6>|0i+S9C*Sw;kP~Yc|Op^h)=OOI~0wFHuevSpC#L?xZ+Umico3faY)>HuNU0s z4yw{8zuH_%A=@=r&q$N)PST2yeu2zb(}*pl*Su5JbiF1lvYTi3%+VESZScCrIiU-- zR&{+E^;9G8Bc^d_57{1Li0)*U4KbRy;ofo3Y#jhYvjz9l9SvZPnXUHLt}sw^T6(ef zoull2uLpLJ*TXjY9Aftq)%+{B`{x-G!h51fUQ}@AhX={m2e$5bNb*R*^##KPh~ETG z8Ga#g_kr95?-7^ubFd;d^^gKz8D!1mrftnEhiV6!6F5vSLZh0Gbtig!z;JL0-$bA$ z@q6^R&ej*nFhiQiKWPz+*4+^B=&>^)^i62&LR|rTo^@R})ioCOlKP@a1kup(N=bit z1yt2=Q2!pZho}!(){-NfnKcs&=3b1wyBgv)cJlPO2r*)3hdz+KqbZZl1Oy;I9 zmpq|ezG%R`WBx!~#i8mReKpA0W}Yg4*%r2Td=wQkWCG)lIf=CycR{w>iss)H4S?JB z^mtNfH5fkpefEZ3Jm|RM|2xFa1^4oiOC6ej24=@rPjA1UM9z;p-QpmAvt{G*!N`m7 zyiNf3ypepUg4X-$m@XcgyzT6ZdqR?x6{1fQZApa@sB-d-9%}Ix8!(P&*3`gU?7W z7nF~-5wWiX{^y@ zDu+uBl(X{`cJrLf4z4#r?HD!|ueK5e9moHPH)vq-b)eq?^e52nNAz|7XE)f?>raew zPyGRXk;IGqb)hoew;|)`uRmgnX@u@gr42fLGT_4&k?ECR4#Pg##^2KoGWc$dZm&sZYK?!~GZwq>u#@^uqO9*E||4iV4l>+;S;KljNt8_GAX@ujvU&obz-+S4o z`G%yOhjCN?3HyC7c=Jkx+=uBxK_yhaZff7*jW1fGCg|g;11tO77iK%|0IQGpa@};4 z2NbV3#dk~bFjD+r6mJd1r$FI+Q~0wKE+U23MeaX|BSV%Xz5$hYi$1dR!Tu*tfgL|f zwj<-Z{x{x$^n+3MS;}re>9drajU3vasBU^6s$F!Lj|m;bQ_SvYiiv5$`CYY^PnVgZ z{g}PDG=Pk24XA&AjNF&8b6>VN5XboYCda1DfR$$}>mlzB)DC0l?AVXmp}>I=7xuiL zS6dMRfxL(FMgu^+MBB*O4e0!a{-!4x?C-HZ4(u{4H4`tGe!nlKLm5c>21=N1lmriQ z?{=-LVxOb@cYgLH#~t|j-hyn;BffeM`I-Bz>3;m$pLhWe4p+E_5$N*ptUai^<=Jk% z#GjlG_TdKG?tn4)$>$se?1UU5`(ctQ{`j6?dg~&!eP9l+>xfOXgts0pZhIvG;CGpE zD-MB!O{knb=?cO1O>@)Qg{`Gi4L0PLi1Kq9$-@w z30ed>kfECg?&H1&XoQu+5hRC>6-N;A-^PnKN{4|D?*|HwYlOqD!yA1$<>YZL;YPYx zoD#7;aGWn-p)6i{s6S3qEF0Xn!FZqN=dkzT5<`QWzhHY7X;&y1N~jLN^GdkZbL|R) z8`n;*;Ou4qMCSa>+?za)cy5btT!Y$)zosY$2V96(Xnkg>am2n;AGY4MZiHSOm_~-36W-L=9`0k52@_+=3|#ve 
zp#IDA6<7XP;T8|x9P`9Y2m{SSdSdNXctPS4|B$R}(A4fwptJOOfYvEhy>>UbzZbV` zw;0|NWe&qLRon{|^{nnJ50SWtfZ55$>jG)+Y#bSIDz=TpHvn7$ ziXLR%?YoJtRzIr$7*)`EYfS%rvC{c?uwpa#~b9mqdUush}f620v`j_>ALs-ngQDW?XpI z=u6(8k!C-CLTtm=S8*<szxY`UBjdWLcmw@^>;{y5ma-dA`Ya`9Lp7W0IChct1Wc<}(;V$b zx2d%nPK;kKi5(}Sc!e`g>=2FIF$#$5!uP3>07^7u9Zg}VV(M85lKy%+l zOU1qRpfHyIqUNu?>~`3Nzj9UKXBxr#bW$s+fTh>FaV1wi*o}`r7yKa!a=5-fI=FZXJTJ-Z*rdGefv5?QZm zlKmRs4BKvxiK+o&e(ndRlxhU1yI(0Z476R;R=jIB0n1ZThOm$SHj9>q1uIji4QBK&j~ILBT-3wGyA z+x1J3a%XI*xL*L?IBDu@ntvI*3DTRlH>nYFp>;ghNWhoIJBUO-x(N>X14ZTZz4((f z`tQeUHW1_5yB>e$+7ILR>ZG~Yrok^)7Z{`}vE!Jm6n@hLCwA$ z?;G}}!qW!3eb0PxAsExV-rA+-;W9Zvwqz=5_f2G}f&z{iTuM~F z;*HuDhS`to!{44Zkfe==fek)aBAp7-K>jAv{+eI0u)oRq>&~A6@Vgdk$qCDa>I?bw zIBpJ2sU=>}+7kQypt^p!rQX8HS@m_Rm`Z%y15C_9ZVw zTHYO@+bHjrWuSXorSVrC6O#zfaN5O1MEgnTq4-%AL#?v*$~i^U@1(zwlmOmhR`~(> z!B8A#is74E|ia9XGD zde;1(4`z?>yl2%ay?EH#a+GyrUAXId2?lnO?D6*zW?lkDhxpZz>$GeYo>+D_0JlqH?)sxn&qWyxM!SR8BwK zot>!^k+=)Rk$Li`U5!O{z)d@wtxuv-f7Kg$5S}-qva07(hBd=RUz|Ys=a#UxpCZumaYNY9qfjut@|yROp)UxZl%qsP zoz!DJ;(KREze~(%W}-1F4J)3UKZ52n_NUe;0M~d_SagnSq#mhNg0EbsnMgk@)_lJW zwF~?Bj$uhyJ~?`A5O1#Ee$LLEUs{x#lau|8%dweXmc} z2{oJ)pH&y$p1sYnbEJN)RbCQTlN4C)#AnQ{sEYcVOq|^*Nni$NwKLVkaqO2uuA{?S z=B`40zeSs?S)Xqv`pF!5O0Tc(T9;|t!@_9Q6Sf8gOpPCt{!_U*G=_b+`nn2Drl41d zg-yH$%@sc+pW%n(LxaW$x_Lg@uKAp!vE^EZ!q;u&IS$lO1Jm9JL6{^ z6nyWTF9)Z#9Sh!;ngN3j<_-?EY2p5fq6c_%m2rCIGMPFhU3|A&PuZZwHK1?FTPJ%J z*>xFpt>sC^c+5xbG48@39K)iXx9jd8Lf;we`GhpjHwwyv&$w?uQB}t|Wqk&$Jo8PW zRl|pHsxS-6{3c73<+jCL)&4}=_v&+mtpX?N z_00zwCMKPj7J>1JErQo+n5WaSkB5X%oLQ{FrN_>={kLuxHI9q4oBTW^z5%$M>=daj z(apphrJg(}7Q(R+cAf%Y9Bcg^w?+!hGo?1<5)F6y&W<-g@goDnFYb`FO~u3$pS ztSA@ZTpY=>(noeh!G{CEm?^qXDPBt5ONcO+-gnWt_NAB;E3Vq88{+HU|$r3igtfvnuR>!aMyNXscw7eq;;l_b^U$A z;f29Z=`wA*wmS>?0*A?sk%Qb)#M|#O8&6_fzoWk&->Ya8zV}QNk&^XS&eFzD*ZIBbLvht@urIyiWxHgvf;3H)}63AXelhANk#sX!s&n_sjG+vX>n*JAcEOU;v%%za`>{v3sY})BV)o z%%yqB3B2Y|&rSX;*F{@&o$rj%1v&(h>#Od#lluC@st}yigJchBws;X2taCog+>rrk z8@0@p`y4^Y)VAE`mZ4|)#?tJjNc@(wxvBZHtsqeiDdhVa+ 
z{kmp?H^K!f7|oGYy!Hvd$T=T+Ww>8=j;{>!;Kj9O0r%-NHNMZge%|3@(vrqp9j%Q0P z;@IokUw`S6=fke3%q)9+&DN)}ywwW0ybLhi-W?1zZ0h$4vZ`sAnY2E`JQ%lplXAL1 zI2Ed9@a$Wih4dPyu>370$8fOM@MsZt48+9Orfa!K5?Xw+pv@+bY@@;fBsy7g zU;6;=YJWA{xg7o8mYAjvSd7v(PJef`4Jbl!Tm8;Ko6kVZqm8vzERb+|-sSwtPJzHU z-VnFMLhzZrv+@IaRgjgi;k(ew6L9+unct%a@<9FS*cA>UHn^0>SqZm5eu6eFWns^>06h;TsUSZXs&CKV+O>^nzDy(S2B4`h<=Mg_XJ*>?(r=9xH8cTXF2`q& zd11WOB4LZHZS<%oC$95uq|v7xUoG!?Jjmp%6UmBw&WBgsZEb#=eF6sGy<=QgI><7g zR*ExX)uHj{r^#RBLH1lZ*JVG|iK=tnscUVrp!dPm0_%1U(kpN$%RaaFEUxaP(Qe<3 z%-}Hb4x!#c=4JOsN5keOX+0SiHZE~8fq8|y_3d94u!^zf&^3c8^o|8>(cK{;C=S?N zvy$%-4O0j?FOY=r&uvotK9BFfvGFB|CvuCSH&b6_C*n7`6^W+oh4@(o-l$>; zz87MuW-*wPZ{?Og{GHAWl8`*1cH|Rt!2_#B_V?H0>2hC6uFAwh0bI?^>cs?0e~)PT ziLn=SidUTCyQO#-DSj}Dw}#?Vpm4q^{8{SqdRC0s;S8?_5t-RR__rk6-II5ZLgtjvy=|w0;aJ9U z$xqE3VDGb7!y!H!H2$c!RV`_MV~nqr5JB=VzsvXN9=u0UVCTL)W-y{VTmR0oUC{Fg zBPDUoB9Qs+>6M%&U8D!yx+v)ogxVK`J3I6l=rg3)cP`~Nfh{z zEB)neya6;^DQdsI#2Vzho&VmgcQGtgD+(m}r@_eXkWCK$EW>`;>p>bS@bYD*%SLV| zxa|6>tV{b0!1m_O#LBk4^!3*czDj>Grk>&bd+21eJt7qd3mWVsc z{0P>A8Ww$#w>)BKjO6Z-&I*K+!_aaQmuAq0T0iC1m(H;6r^&}uy9$DIvvDaHgTaId zW3T(mUeQlCw^R?jj|Ll#hF=0DDS&k@_~7P5RGU0}5S@DgcC4ncj%0WvJ`1V$rY_DzcEuK- zilwP=LqU^!K!z#7vF8t(?TuyCj zMr$`~8=K>g&o9dU#BBz;#wW%WOJ9ZsQBjT7C8wcBAE#}$s3-Aa({#d$wSKr{cbUP( zkv;g{%=&e!yDGtqWUe~z$qQg^=?g!X^lEtM>-iDS7a7nbaVVeUyLEES+d=X$9vD0K zn&b!De%pN?$y?L4&a#K(Q?Ry?+)3hmGuA654Iup4HJi_nxQL)OTDu~lp_KOS?Hq|C zLk~PJLgE_$tcK2|7R!epOtULN`CuSaOM=W(K+i*$UEy90+;Pg$0mXFz)9!IH-T*xh zCwzJyY2w>Y3%MivEW9@Nav=YsA9J!<2uO>C(HwXa_((Y$$@MUEqii?b>XW;q()i?U z24A+zoZ59}z)YDEkmkd;doHQw*}edTD?R1iHsTKU4U1gX*nJ7f+oH8W0Z8wby1Ss@ zr3wDqLMQm`9`K2cew2JFE?p6 zyL7mErYh-0QBy9muVlbOtr*-z*K|&Ix}jD2iiE(B-wL zI{qoR(pIcc8GuJ_i}J#Az-RGyj4{Kv^g*%at!LMEv)b2>SFe@ZPkVOFkG|D}I*`4}tXkmNs{{HPKDmP7Iow-(*p|;uPO4 z#luMPgHgOS6rTcx^G)H;Qn-i|UKfQUL*W}xdAC$P7?r0$#m`c4T~xdQ9Q$@%aW&Sz+x;aTj2wrPBP!v6 zH*B@A_)s2?Kd%)7#o4YPs8NnFn{&Uxv`s9P~S4s}_+G&BtuQS-mB=gY+UV^5ou1PJ8tkS=J91aX;3rB4Rsp}vG~Ebne>@NUec zB_~fFjH+>|9Z>b7`@g(1)WlE$$$}<AdUTt`t4Snp 
z$F6)){XzS9pL_dogEQz{5VOJeiValnOvrJWk_X~5^c(Y?otYY1K5O3D#4s^cuHryF zKiZ*icN32zoZx3)-uy^}2W8vnD7r-D3%#-V+s1339A_9!0G4N zBO;Xr1P6`$7xHT3%cqxU=+x-pPE#$mVfLcLX_rpXvJ+n59rt`7*s*9`N97D{>0|3S=bW5=2M7!_=An}{kimf zAHttyalckqfcmLS?WiFVuZu4FVpA`|kzrwLen=>Ba1-ktev)~&K=V}lR+JCM!m!R! ziQn4^JC-9v@v~q66rVwHU8vq)w~VkRw1d`}B0m^NX7P-C`TS0uubP(s>Vqe9ONu>P zpH*QzUQEi_r+Y>ngphp~>y_3B+FM6U;a{u$K>a7v{_5FFyHfC;U8)Hv#cp^}E&*?=QKG`>Tx zgmRj8(pHuVYOeRV!4&&kK9lu2m%ccs`)PAUAgj%a&(@OnJ!{i9_^M7ZMA;j~th1PMhz{)_2k9Zd%ql-+>RXDKP(wT`%yas-|`?*zr|#u-1P{XfArb-f8?x?`swp+90WJT0%qri$AHK(4clh?NMg3-V zd)aZDof!_vL8xC*f?PJT^A_atOY{fgRe`e=4`!BuBgfPxcD~sPmu7Al5GoG?8s>F8 zU#I3+l26CVUf4N-HQ~iwSu!T9AZ@eLeIG=iPL$~5Q6)Nzc}(zDC2U3cy-ylAdKKtS z6Yt)PH>K4#7>h6$UeIO5I((8hy7&p{74!wWQ;L`o)AWQVB1>T0(u3Vk3TSA40kdlx z#EIIGgpQTgdtu|EA;nYD@6 zdSP}Rc4dvFm!t>6Mdlk$=2`24{s*guhPL@Co zAEt%LI|pLv=oata+X?_?;ePq$dt|?Hy4{LM8;Lj1?|YucLc!Hm9ZzTExAZs=TY8zv zNi0d0v%6820!oW?9_S)DnALjqQ|ieo^xO!piG*Q9`<^dm6ZST^r&+bfXR912@48KC zzhox3#Jx@sgqC%6BLMjECX^*x(8 zhwS#)v5&`h6R6Y>jM`6dN;ypCEci`3d#i`UpQU%U_7%xXpz}0_#U6K=qhUfXJ|G+! zBuCq~8S7|~{V>zbeaxZ#4OdV;7(gFA4Fdhy^$pv1;6}xzWLy_nk9|)FBO0&&^@Hh$ z`s1X1R(F}*+JvfC%(?DQTecpaj~lKlYxm-OOw%jn(m(Rb9X^f^-M25eo0R8WHQP3$ zbDY?GeUlC7x$iVu`S~NQqs_^AEG`{h^6I|g;kbyrPJ>r6X@A2Sa-`YoY7jjpgI`Sk zQ1Wm|Hhq)lj^#zA7Vz3O3ntHxW_pKHl**jtaVFruVKx7?3z4#@EQEFVFq>HPXKpcG%V`g+S9@ zwbI22m6(;wb8A$3iY%%uAS?%YL=6=pLBpDW%{Hr!L5EE7jmm%>7oj11Kb&mO_{Hx` zmMC*@J-alNMDJx(aLHhT^ub zy;YxQ(tB80adDHyXU>rA4Ohk8poRTjdBq`W8(Jsk(={@m>?bII?1t%ASJVXV4I=9? 
z#bp*m;O501W;WCG=itNrTGl+M5uU-wmn>jruFy#M*xk$=P&v(;Cv=NeQ=#|pYKJkX zYC2Iib(+i%#8Psm70Gd(FzJmrD$g$Mo1s0Ko_Dw+LkNNdIq4pECD0SdOk+8#qxOmeOJe9KB`mLk%W`uRG+bB?+3=A!j8{lT*~nx zrdxr6+7n%T-Qd)ZnrF}vqk{_q~u-qbWj*_W90>3tk>rV(JCm7I4}$lp1v;XtuB zp|+v<-u1>>fX*jv548Mnv1J_B#VQ(sq|$lYLv~l;$|h~)5>r#+fJY94Z#YM{7 zi_2;2oI*kBkrC#_wVW-@2uH3V%Hz)9S<cm}+7B>OQ6;*9a!q1$BMEvm=9`>b@v$?=X3 znBnr_E6Dg+vL4g2?fu*5N6HY1PdxaVxcCT3g|j7wLn1gZ zVz`oWR_x@`2r<$RhQ0SM+vPxDDZzDOYc}E7iRY?@UupPI+wmK^Ce!rO@^TKh=iMZ) z^LoFLEP4I0omu4l!~ERGREX09$|bVeA6fg)mf1EAePq4Z`*q$dvQ96F{B~<`?s`J* z=uYtrdmf@Y^wB|8Oci%GJ)4-YVuH32U!_ajM6Culwx=|Ylo)-_q`*Ql|qXkkoBXWhIeR7~ys@NFI+DeDs`K$nR z$rHS>#=D5=Wf9SF`Wm>#qUf_@4~=lYTUB~=VR3?SSM2E0xrfXx7i0(9mIM6b1>F}_ zqdWfP6{q-a(fZgtjHqPugQ1emTZ87s=2Q3&=ld`IEc%{}i}*iyUHeEJ8C1{4H~8PY zTQVOE)njCy!vDn2vg5j_c!PiaVE@=>DZ2rs&r)(W%JMs*K^uZ-xZ9pp$Jyil+T%i& zT9=M7h~$6d>`vpizl-NRBik`--yLJOtK3IE*BH|OLWmY+NxuguKW8T^;OImlj&G4?L4aw7ZjBK4Kboo>-9W1eqi3_xd%DwjUY?YY4mRM%b!KcAi>b1{zf5LV1sA0oBr?icN?uIgTw0KSY)~S{43e$*{59hO38jSnlqtM)C&M^zm)tQ+cao^i^VQwT)A|c<|8F=+g9uY-T7Vr0-<;Hs6DsaIjbdzA1*&wFMKDO zuf{1wTRyUX*geD#mCYyrl`ESaugoxa-i5^*%=|<}F2f|MsXYM?TSN zJsrx6dGygd3+!CTxK8ws&^q>g#J)USiK}Wn62kkrUngPCnV*Y9vLMC~5T zHU#d)HCScd(a+0}{rvIRu_0r!zv)9kH#{%(oOfc$d33+AL-B+?k;81WS>00!llyNv z%pZt@foCuAGT>G`X!U@sK)pGkig3NR;~MxCo9Janm)wM(ef)$j&bvzXZ){npMULAQ zVETo;FA@?SzPMDDxYo&g_s7%Jsg15sKE3`JR$78Nqx(|-);YyN! 
z{a|VS!BrBbjA1Xw)QIP&4T@ZHtkZ45CTflVS=jZ%R&3veC6F9qiDU1Zr@{~@9ofgTE^FI1`!1gson`O+tQbga&G$O!BBakon>Dtg(5Hm7)_k#}h7!`)PcZ z9O6Xh&yPc{M{G3+qvHmS7E8VG(M+kHb=sGp@&c&uAM}Sl-|XWM33 zKHqhoSxvXIKIEL9ww_T2>%#Wymc}Ro+24e_WBz_X@rqM?w-gT}#ScdD)}Y6f&8I-& zd{g+dXgzf#E+U23Md8SxdW^(3pz?01d@xka&QqY`Xa9}s`X}Ci@`F+KS;}sJ`ce8U zTh6xbHIjL%GX%3n&2<#w)GDJzl4j6+n!!s zYz%+h%bb@sj{FB0=5B2mNVcnJw)K*IOWIu=LGNEKYjCAa2&|w%cWW6flmL$+uoN2iHw%AJ6|{ z0^?NI3OcUVg^N9zcMB0tnZuh?`xV`nz{`0}w_64d!DjJ6wu+?fn= zKN>X8s^ZIEsh)pOO!8xwH2Ru^FbKL#I3f@ z_^n0f%io70p7Y>3jOyE5R+=#UadEJImKWIQ`AYS~GbhONGH=ZbJ#8@bqXosGlmW&2 zjV^9PDbThHr}br|0>)0B$~dDS;9o3%@0NKf;PmniQCOoz1V0{hN5!LgW2S>!%?a@E>^(Q-L1biJNH!?i zg+J@udhvX34AA`A=%fAm7Cg0z(D_~HM)teMbuJREH!iSH!4dapwB~Vr=>rY8HX6>` zSr2DQZthristUCea}Aa|LI{21R9e!_x~^{1iMeE zWcNP428RtJdY&G-1|6NoE9GlK>4Mki)9Y78&;#jxI_p>Dvo4-s40A4PXU#PfDykt~ zapz4Z4N1P+%(eg;$;0RovX$fq13}O9A6RQICSvq+Um`vQmiiht&NteJ2EFIO%6N^y zz5#@b2uuaocwML-Yxs6k12-49EkXDOAg_X*cT3hYW>=WwMys`NqC5q#a?jRPyu-_h zPllq$Z&?TtX1y6kWV}J6>|Z}v{nd*eq0?-xD)uYXA6sKj*8@6W{ICWXLaWzgwac1P*z4d5)NlR2#=4oz3x zd{D7Yn4n`zOZIKXiIZ=(YpT3kP2^=>j7YoN$r6$gDwz<90U-%lh3{V0C*KSU!{ms!{v{6u;m5`H_2DgAFtspZB&@(3 zQoQ06-)+M`Jd6}S7@9Ymw}#?Vpm4s?y4d)$6fPo#*M;iYI5HHz0hM=4)|2^QRGz|r z@w0#9x~O;q$`3}_XHh@OZh(>cEG1`;aD-m=Li#RXeyE%)M)^DF&juK<@1vTx0r95j zubeekzpd-7`V4-@I&pStehDP@_qy@{D{swSoh>TuW&p(^hB-3$=&eGJee1# zm5)ICwQILX%a{Z1?6MW5C_b1RAKW7QVc2!y{bZ?db#ExXQ1s_z&ex&PcTvC5`@Q=C zhjUHWw|5LMjeB*##iZVhF}BI6viotcJlW+Ym(Qd=v^7!74G%SX6H~Lu4R&U%Q#5%I z4Sp-`KHg;R3fdbw4&S-u3NPH)Hu6H-7A!hV1o@p61nW1?ZZ<{vILVG{%l(L0Fm8P? 
zb?#*#cz7jGQ^I61{HSdhPw&6Pe7dBDw}iP87+{s+3S4$*9*mnumJHI~hQ;qwYV-q% zyS~T&y2+j&CW7MJ1^LI{9$KCS=C(KQ0sV;Id{}C$L3jgmGA}HH7*Ph_#$6}ypGq*k zMyO?LqY&ChOzOE~8x570?=IVjKed&b{K%OE^xjH`{Bntg&P$%~4+bK;PD#7>&dy(G zy-Zs{hlObUtmIw1pJ^#gKioxGc3^j$&FU>fZcr=1Kg+Gu5Qx<*eDH){3aUN}T+DI* z8t5I~cH>b^HJC2z&$aQ+0C`uVvYt8x0xRyTt-B;yK%Vg^C9l~6-@87DVJMvg-nMmi z{`_2z=C>~4c!?`6@MROr65+6JG?sSl)=MPk?fJve7Z2V1#i`b{3iO_EIC|yc0es<9 z(?ww;lyA05+<3P07Sw9LWj23F8Cac80H=KRx%calMDqNSbv^L11Mcgx<3%E^9{y%2 z9J>GN7FghF{i5Y^9>89!#PIKO#2K>E^SEt2kp7+@SQ@WO+*o<9w-AQGDL&D6XEN?W zvko!Ip0dkedf%}pB@M-3@~pu7>NEQEG->TbncrIUjWTI>>(XNAQlnRw{8@0B{yhGW z%7F`$^aq8lY+ms;>xzpc->t9k(57rpHP+L@wDt-4d4y}vS(3MgKCjGT81X3}`BwF6 zhmpc)IBDoI@KQ`^e-On-BBODo!YbrzH8<6$b3JwjTIKa0L<%7Y2 znt*bRH+N{I#Ov7)oWGfKvkZ1z7k%mcOJux3(j4Und&1?L@)q%?#g@7`%Mn3eRoVLN zk1f#}q@3l=`__^^>JG{A#}(ws_dw(9+Z}kD{yd!flmS?uA(vFP(gGY_?Xj1yJRJ(T z2c-Wh!tmPtdb&S`hG{kqOAlPzw}S95`tY<&q?+DzCA~n(wT^zoRpVPhxdpssYG*Ow z9Zkyfs%zaSj{##uWvx>ruajk*BtdW_dDrYXMhAjx%fI!kQ36LkoL=_+4Dt^=RN*=^ zhwvna`xf7vX`x~JYOA+#p*T{5^PvWtKGO=U{iOr$Dia@))_!U9c}ZX2TpSo}*-4*? z-d$RHH3H6m_wljXyed|HJ@-Pxh+g`Z_S$b+QtLpO%<;;W*403GVDp6oftJ8Jk0ssq z`~bKj_*J&+!v24G#VNkq|9BYx@`F*lHUGn>z{dHe@Mr%Q7qMX(ASfJ}|HU^T^KSpm z2m2>afr_7{;<~7Kga7v{=X;)*Tg-+Bq=6a(~ujAhbyjzV2v0AK2gmC0js7u8?4Vjf_a|oP#~WFpBD(Or$E83uZ2;bsf1F#4OF|`IW7DnAXXsz_isnnNAt0k`ct9TU z28(Cvf3q+-4^O+V?|xw)22bQU`MMoyfbW*Qd>pW751wT*))GBd0}36QTqiPX(RCY& z-~t;Wq`GO^^X4_6nc7id*hcET*tD6_L1L4@F53lK8G!!l{%|nE6!+ZKwqPrd7GZ4R z!dRi;4}WLwYp>c^2S!TJsM)BTYe`GA$$C>hasZs@9SEh)Kh}D?2G-?%&&?Do z2Tu`s-KVK?`2I1^ZP}acwEnCIdwkpO(6-UeZ#9+wL{l!FHaI4V{Or>g$>qz-lR zxx*~9LW!6TRau0KNbU#b)^vbfpOSc#Sg-e*#5Z8qGlUNjgMNSW!6YhjyU09++vWN< z?^tmX-wV#^Ctu%z3-In;7Sk(&huJ!_{a`H{AKoPGv&dW(=N!X}#@*&7v~MSx&pVOA zA7Z#hOjuuDP&r)Tyt<6FL4$oA5l_-CjhGu_L)c{#pAMe-tAAdU5C{Av*s=PPa17*5 zL^nV8J3V1=*It`Ds>Id9(fm&9wD5zU=$zJ^tMF5%{uNdxn#83T-TRC>|2M=no}#p?P2r4A$%+uj3^06~*NuU;~!A72fCAQ?&C=H|uG2lGLWx zi=oHy#OPSN6lh=6>$R{At>?|&FQ*D*QGVLPBd3nc!?8r`g%h)KxYR2JH)e%7ft5x! 
zRwN4(oLV*Y9L+P#Bm9{MqJN3w2WInD$i&S@*J(&pF z;|(Z37-gR&yZ>W1_*b7LZ%jD-Dij`H`Rq@YuLs!=W8@!VkI#^^C)=YF4cl&j4F@FRoL{|hqC*Jj_+reH%{y$%ZAhI*!P1`mg$CYADCy~ za}1I4%};NJv}>aG<~G|-j4>L6iRboO-mhmk2`R(rE?R~?G|taZ6zyX3iy7L!T|qoe z;}?6l2+30nOMY<2h~2*-K^Cv{6+6rSD}}udZs~Bsbvd80YP}U0J^Ycw@}U=G%xx`N z*3CuOEIZF^$y-dUf+4-Xp9%^Vg4&n$DJgG8^C*9;J`X*|+ndi9ju_$?wmIhA z5Ep?RNbCY;H1rwcENElfe=zKaE%_M{XxI`DcH9|#H*JnzY{7*l?h+tCa_QK~E6(Ki z_a|jt@bsGT6NXFg0|ZJ(7_38jCi=69a3ab>-S>7!BR?AYo86~H+(^)>ELvL&M)h(R z(Pc`Z0+(;!D+6yLzs1Vo!d%rpr0CV+G-d?XuP_83edgS6!fY}H&9q))h?PW479gts@v$Lcp zED*iFgR3(5VhJv-3!JO&zNEQCE&6%Ab_r2*G)m}8={92i+~sz3cpw;4o()xwWE?l< zb|9WSm!U{QpCkDHF?HqfRDAD$h=|fcDj}(?S&|BO+=J{Q*%PvdlzmNHJCQwQNtR@n zvZU^~hon_fv}sdG+O-fxerKlc@AJL?oY$Ot_A~R$GtYUR_gk7pe7LgInwa6D_-;Wt zNRKLfNZpTO;F%g>VMp=Sp!u42c~(&O8RM#}55BmJ!k;DAfpKFavU{N4qtm{)m@|bV zL#+=6Bk&Ume+J38+oD@cDjrOGd(@$<2|eP24_ENsbC$#%hr?Da`F1$}d+qb2zd>L6 zzjH7x*R9$5=zi!d;I@9NPhjp4c>k(E8e>@J>_~XBm z3kKf??7{6myY9IeAqTk_O7)^!KG07T_OxB~B)@kgqO?C!a_;t<5Np&=r>2wvsW**x z?gbviwjv9^;eIXfXbq$JQK2sUTzv!t&uxM$7N^_3eNzCw`5NtA_R1S)bX;29<8Fgj zAJr;uNBp1tG$OjWR+|`!{d}`^SetnHEM$F894{`AVAiBoybR}#efKJ>R2Rqmb<{&Y zSd#PW{5?gbm_!E3Cw~Xj$&}(1{~x|viieT>o8$-kU)~y$Pl3!QalZc-f0o2WB=bqU zuK&f6A@L2^aku})gZ+(DVEfOaYLs6W+uz`S&cP`AEZHt;H$aINsn4?IEbVuOO=8V` zNdA9%IGyUhKf|3$v-X7$yd3u`gufcY$2-$p%eUx4L8X;E_sch-@(gQ!+i+BlF;|-s zh@4AKWSa)C{D*`&_ZB$p|*Be80pzNC&ifYc$?amP7XeCUfUc0O|*0v~p8+ z3d}3@uU!QDFo*O)#5aXI;V1Sj_@n*qHF;h7URjE2f9oeQ9t|~M9g0T?c{ry&Y;_FQWl$Mo~Jj58c{uWiLUIN&HTZINKm~uEn=3{p9 zLHK4(6IH(cAZSF`?|(l(5=e-?zL-jXO&kBI%)B(HjDKhh_iIIQZ;LCYkIAW>gu2fY zX_F4+Wc~3e?%ia&Gm<54D4h~&bUi@Geo*Vu+|B^>^q-4f9?OIl3b%%g;z+%Jxx`u) zlBbN!wqgT4ymKfidsEpS0wZ?aG`uK{cYNBjeVYzxf5Y4&d;Y#x?)f1(g?zf!7CRd!D~P3AtadQLKY{n0I=TLxCP`|P96LSo+aC{zmgfMUtKd_qPh%?4b?|ucaH>KV zTHn6Y$*UYgQTdiNsb_ZK*!7i1isg}<(+c_dCL;tt33_wfrq9AJa0QfbY=WIBRqiia z+Mw&ZpxbQ(3tHUfIYCdo1{I~2XUEcS!Vfck`*t1ag73cG8j>D01W$(ZH|oz%#Sbft zkbJk0k5(Rtco@?)_w(LLmKC66rsSO$7zK627MDrS5oH6|@}xIahYZ 
z7OFf5wbU~eg8olaJyO2hWsZA1i&N!}!Vk61L<~ewKcw2~o>I*SsJw}D(UwLuX#RBz*hN=Je`sa#$ zgx`!ue29b<)ViNAY=k?44~&ZV7h=w^A;UTFJ+~ozS$Z8enR5n$EdyswW>OJui_06w zuqr6Fc}?(8{&^@^ROcsaQ3J0icIzMRJ_3$){5;5$zZ!4NZf1sjNQ1roNhdEIt|8|g zdXOQE%dHiUvQVyqFM9W+?()1y=3})Q&iI38Wp#r`&O`U^p{P(}3F5I|QML0g72Ler zO;JKP7$$|kJ+n424}{2YH}Ezkkn<9b|BC}YeiP}P$-FB2_i(1d?O6m?^1cmd%_3x+ z)*6V3D-~+ekY2jvRzqttDKCF9IgSHd;N)HY{dvjYa>^b-j~~ZCZ1Gj5G{nsYY!dD;KkdqH4zMn2?3XFv{gKM8Ice4j)_p8jZ=-PPc zW6cxKi_gIgn>M)1uRlhPuUz-O5;3;-h0smai%^75xU3?jipiNf!2-3O38Yi#6e>;e#RPySzV&` zN2hYy>r(jMc}2`ivGeero6zdnPe||W)MQk}AiXt<5!k4K^mWFwcMV*Q#IoS0GB;$a zLDe~(vZVQm@YD)it!eUMOI&oTUzemTZOu>2GkR9hJ& z9dcwnbPio{cyZPU{rvUV7z1x3kiiXX=f~RtV$Q@)ZABFbcNowP&NU_)Ebi7IUU4`V zhLe1^0FC#Z;k(Vm{ilf?h#!ouZj?*%)}VYOiD!Lq*Ej`^vsAv_nf>_D zPSbY?uM5aWO4>&5bSJ*ZoVjB*XG6?c)Gs09ZkxU4sCY2Xfe!;@oB~$5NeI7&^b^Jg zn<*W0;#<`x%HLq*$KP|XE3)a7eHL{qv09wcry2HZ{P1OZ^^~0D{Rw5i1a%P10#@IUxAoab&U+B}U zyH~lxq|rVxat&+H`wXf7d*`&!eqmp7+jrr~*SPP5X9LK0>0I5V=NG}5qR{8h`m_M_ zKY3bVj~-}RC}i|)$PpiYs!i~DIS|T1*srZe(0ls&%3sgal|vm4Il6=(C^qIpm} zjBI`|Hg64?Px2|SalYC3vt&Mri^#_7LSMuGaAf}C8?fVU+3{fPI0aPe|NUo4zb>}F z0s9<`ZJ(vu|7SNK^;x!@-Fs>Nz$McM@U=_JOmlENc|WOkaZvIWD~eDg_a$U1#s_a& z^0pvrn-#2feTDBavV{)!z4~M796)S`*T9QvE3}U!0?*&?u6y4z$$i8a%QoTFC#JHx zniJrMH7&jMGCAP0q{{fdp>(i0T9Gp%fck#Nw2p2j`@wh;oye4tXzE4PpZl;IAJ3_4 z8i{jDh~F+z$=k5E2{;FMo4nyy!Gn+I*ve;d5L9}dHLr&A6A z&C#Yye!bPO<;oc;A>N~4X5)tKm&1Z#?ekNb{i_@4>y>ycMf+;#^%v{hbYx%Cz5V54 zj~@R+|D;u=`z>-ASbKjm;*+5WP}&q#wYO^*tfA@JcZtn|${OKc=FMmk5xTrncG8Q$ z_lesiuQ(bH*0gTRPnuU5W{G$hL1FPSk{^tm4;H>W693&Wp@jGp;E~5NB+fUL&#Q3+ zKUUI0;v&M8e)WNQi$Bv!FF&-uxJQi;Si@mO;u{F(9mdJHTi|L*#e*St8~VV!1;~Cd znxQedi#Vz=M)`HESe1K%^f#b${XGXG`yKjjKt2bZ=b~fqv^7!Yq&_>jtAdiVEQ?wn z`0HQ>$)Y-Z=sH4eC$$C^n65qXjKBt z(>pqmRKAd$@7u|J=H&CGb$yyTr~ZiFGQwXa1rDVIaDof_3LfX{7z2k3g+=afYT(>G zt-W)?S`Zs@$*4ZJ9`8ICX%c$S9N}d~C?9l{#<5Tt^LJAl@E5uwQL$^Kpzg)l;Q`;Z z;CW9oZIeY7oL%wJxIp(o;A7t<}Xt0i5XV8@lHFv|K$~@ z_-@%ejQ_(AM)B6L`4s*K=NlvOXaB=R{EOGc#*ty;8?fVU|34lK4SaZu+JLm-S<22JbX7KzY_CP6k$MkK_a(aGQ)O9>f@^Jo 
zg^*=&d7T54zT36ByDI>?hnj>Gd%IBgVJVA2?jt5)M%sOl{p6}2p}PA5{ocwbuw^W| zb^ByIT$|SsJ@z#oas%I2K>BWqB+YeLqQats@b~>}}h2xWMTFP64%%S;~ zL7Z!cFC3abvT(0jFpv&=C@RVA4_3ZW-?z{=0LBcTYxIcqfS*+w=U=Lfgid!=tDn9; z44(^R#lN0B08Q@H`96B82WU}Yb*7wGnU8iIQ;F<52;T!0?}%AxavaXSW(q{HpP)m{ zrv#|q-cLk#9){#RlwBS0RV#n;*fb$K?wI5HeYckqE8V9yH!1SrvBUyboA?08WzUSO za6?#)RnXp}h%ODC4EI1`@(ojTSlF<(`{gJA_H*2t|U`d-O+K3rQ5 z?uDAZuS;@8{W50$oO2-Vm#@7B0Y&nRP-`L72n1QY36hAqc0dgr2Y<&!mGLc{@u zPKs9?y&V@EFy2bm8|2S7Ak?!~uP6Dz&^rCR{?nGs#|qD3gxnBUJBjm+F4xxu+7fpp zDsvGoBE3$0!3TubMSr$U*wdswj(PaTa@_XiL7Icy-us^M@wBRIE9y}^7|V@k2N|b8 z$Hq(N;f{Zpn0eEP_*%lOTuZ*>1P4GG|dMsB-A2p3KJGZfrY3RXvX4})lOu;+49b~&` z@1;b-a+E!29?PyB+jns{obhn^IeXwP-TA#{jl@?))K81z`vvRJ{t0bkU8Ixq&KXI1XwCwam>zT{UqUXYN?Z_=e*k*nU>$vYsp6YW811-_~ zHdkrCy-(UT?QWksnn(6uTot&JOb6ZwrJz(dH*$?H@oo8M$CYUzXuK|mZFR$N^uKw& zy~Y3Yilb{b-z~+%$mRz_`E1@Aicf)!^NsQ;{8=_GA{(#k|8Zp4_y+&tZrSl*>^KEf z|DXRX+pi0w{0-RWU~Ky=D#x}Pu=QC=&T4%&y>5U14LqFnW@@4667ca`nbTPX(qGD| zweb)s7vF~_O&&t?RLjTSdZYBZg0-?UUOFimm%e@g%B^0N)?GCXJPpVH{e$rAK0)r4mzhPYzU!SL*=W-i*O%H zw5}3L;)Tv+{qpIB=z9#4c*xU4e2X4#eYY}oAmT?4_kI`y9HqAiFjPqdHaK;s=| zM9K}3^~H?F*W&%we@p`JX_NDK+F))%Yz^Fb?#%Oe`bUj=eywwbEXGU2naay! 
z?Ir@WnHpQ{uGb}Qqgh(*=Ic8m2;@a?O5`fxs6NB#ZrO4)p8i~u2_ZD@1oI@X0cN3c zLo-p)%{tbWb10ANBYnk-(O>EUzmb1~zrKxWD#EXk29JOdgeSga-LdJwu{JAMt-9?5uLlxxZ|P;UM#V0B1`=YOjEss`CKOT zns4gB*_?>cUZ1(jtqw9~zX;_%q~xSR*g*^8bn;>`;j<|aTXJW!;aL^pakS`j$%F0i z^v&jh@0+T@B6IiITQ4l}Wkq5h@{=^;v(QM2W7jU??Pz0D&FM0z>*Z0Fv9}EEn-^d0 z5l5u2ubH^QES|^o%0S?$G?b6=Z4pO}jZrRqJ0Z$A5C_ekr3gMku zg}!jm$K&p8_>=Mn-7gEj=oxl75LZ1XHhM06cIorV!YP|a*DmQE96Nn?fi&~#h}0NI zK{T`TP*ri(G$-n(ZC>Or_Q&m*tSRDU!s z2}hPN(kn4+%TV<^BKNiP4pPppvEX3_olyaGF~%c`1IOu$Gyk+Z^Uu;4&zC85btXdO{nHXwUt-!ScmIRBli3pnx4$0jE|r*1RHzWDvLj_;&h%y_S?r#Qv@ z{$9Uu*&}rX-YN93?l%a~=kY8A(KT{Hi$Ds9aq7?0}p~nKeXs zE1Nt=gVFf3g5+-;!Qbw&_~`I?Bg$vtg#Gu+PCzxqtRqjBPid^jWr?O_Tl@amwHg z>^R@w;T&a-;*dC2Ff0<#{7dh;tVQQGh{iEB)fYWa7i63!vk9Gla+XI)QYo$e9$fNu zUisEQDRTZ^TS6#(SL%xjb)Hk0%j1UH!M<`l3?O{hzUKCY&d}(-;0NJ(^5FC*A&y0P z5wJ5qU4S7>`b#M~2bDykTxu ziU%&OVgS_(l976k>icV@>F5#r61TlGS-6+XSB`SD$CXYbM2vnt&3d$Ha+`;58-2+m zEq`sh-82?wm+(=aLp0*bNn?>>JLb*X`rU`*wO~ef5fr{geg}2Nrsyt3;~w`vAG#EM zSAOnn2$_cqMNO;9=G`bZ)DqHt;bZfLxF630v~K3G2VOEC^4kd$8d;hYc>mF??7e zzW{7;`qgzhL>0wvt<;f@48rZs^1YuHDg}%yeha)`?;_VxvGhwE?z6V|Yp^S_<1?P> zTgEsNH;d;^=00x%2^ND+dz*cL&GKyC1D;Am%$1r@vw3!S3g1=Lpq$+}x4hV=my;EM zbJxjMyWx{)zSj?auR{Cnzck~(19IPeFYQO?`@uiKnrhv4pd~ln7yF{SoVF&|N3v$M^3Vv#Htg zpXP_sJc^XJ7A5u2#@y=~7s;(f^H++za$!4488254SmDX9H@$L$gUs7BjoII=YEVJ< z;`qJq!pQHpH@@n*454F1?msBr8uXkS<>Y%1`q#2aoNxLxPfFpmSeaQ|V&LN}p{!fAu3NXE%*qkY01f1>RaIQCcV}f%~4hbxo@6 zE7NI9{%@6q9E2);Ra2ZtIa4BIYfoLp2RhyI_s7R9PWaZt?=WjGc|NMSUi^?5nt$b2 zHC0XY-QmJxvseZ7*K|X``zMW6G*bq=TR=EDK(L zbXR4~4t2a@-p<}zK`O+X?{``jPAL;gB_Cs%`d?_i-P?mtjdwA>m|vWkO<0G&OU)Y> z&Cteg4460+Ua=-tKG|>YDdC6uv)gv$zyXvl2?{$*KHr_b0+SIawYqt`cOM=$q^z=b z9(k@kmCbRL94|KNmc+n6Sv~W6_#+kN``4QurSvn3SNuP|+rK=Fs653F_8)J}Up@sk z&Nmx>mW_+Z#_M9^$guGZ{)@Z)8xO{gQ$WYzl>aQ-uj@a5gTLossDH{ni&DyNz}9Ek za@Ov6bHpi;H)uan*05X21efMteb(@woRz&SNIcEG zhw5QVbSIXP>%zE{FiOh7glH{N&+WW3vHlQoPCC0Z+SnHHj88mve!d-syD!<^!^~#d3v!uTLXxAZn7;(nX)cz}2r3 zwDH?8N-_P$b`ew_lj%{iCMqvYo20(BV-`Gc+Wjgtj@4QF)NjvEXXJl=BV&=(C#Ht% 
zCwsd>QXhYGMknzBs$cp7$Vw4=7Y8gop+SCsW3ul@B?#U}*Y`bH)kOa;G#wmphRI6r z`E|zYcQDOkwN=OW@5#*Lt-Lk+FNV>6TlmuXPnm(~Zl0?Wf+Sw0xz-U#`h6}j7pX^h z=vv+@ZO$&hJ@3tWd|Zpc&pZPsgcF3pEy*>ZcUKBSte@+aqWcvldi_d2n0~@+8afd> zV|}0IZN|vEAlG&9Mm| z>Dysoa%64dG4vgZ^%pD}l@BA%@=MJe+Yk%1wqxg`xNXq5vYN!SLy0Plz`enT5Dulp zXm{jB8(dM>5xD$h0Nw=dzUY8Fn0tT1QBctd_x*l6dA*m&bqn%<_Mu(#e^N+7f>7B6r7o1OX2PLbb`{zGp~2kXN4WSr&4e^q;=OSgJ*=eDIXw zyM3c-G*0p`qV=_gYjzQO+Z8F^8lLNmPR{tZGgXfSPD^Y-@wMMdN&H#RB)NnO;Ucn% zn&3}_*TurP^qXH;eq~~XJU;0{YJ}-4D()7jrg4X(crbcZlm;26fclltKdz&U_HkIF zpmirv^=yjtHvld_qKV6+1@xMNrCX4F7UUY`a62t;qs8@n_Ndwak~XWh=kTdh>j}k? z7sSY_2=G8Y;>nAC7VD+Rco^rBn=Cc|`o-08AxMuO78f}=itel4}dNUl1VuKs3D z)Ms5BksUY#Sb@!rDPce9uVOb16lyO7b0T@F^z|?3oz1J~EpR-~?9Z2XusO-8t z)z1|Q9$SWlH$|xc^!(Rul|Mqfn)d%`7n2Rv<8D^9D@lFVImw_ngE*tGxZ61L2$;Tq z=fQ&-Qhxm96=(C^vUwQEe3BoG&0F*TdDApWCCfb4@{LAW>aqCCZt14=()aGRt{N0dBx6Zd^2y0u9Eu zvo|BZGgQ9fy5e3`o?+XY9D`B`Ewzs`6 z76%xB9z*ADo>4Ez$ZmUPX{!keRXauB+BDHOV09hFSLon~nr#0LZX1vnDjWH+!5k#D z?lBQNQq58%mZh$0H)jsc%$>fUeU6?J`}D#7EE*cej(bMyhM2zko|`5&0ifw@D_ywz zAj)TmaiuF0YP?=nqKbQfzft&hZ?Q;P=TLRB_x%JWOMa2Xi<<-V%n;|O61!p6*xs&3 zFWs)9`}z#G`FdV7pJD9b3=d((a?xDBeFqr)K6>hy;(o}#DtfTT$rcPxWaKUvy~@Pe zRtY?Ka|7W(+oeqV++=Q>*ckUJ=Q}+|jo!A)Pz6RldD|i?B?DGXa)o_T;fJT!tp0KC z*>S+_A7~aOngKq)9m=-m2?1-*>92V{ybo|~{h~BrUJXjZle1Erx8bz2epN+%-Z0&Z zrl@9q9N~bila-RSB4UXAYjGtB;InGR56{j10O<#*8)RLG$&uk}&TWA}Oze2qr#5?{ z&kck>=~BgA9w{06#aF{+CW$L->=_V0IXAjTp#s&Dd2uz>2R*;@UL@}}BYviKXb}04Hy|m5LZeK}u&GCoo!~dS;-~9NIv}S zww=vz{(k=t>*(n)VE8vTKX(@FTsv}EM1Htq>1xG^rmrI%$-y@=(^sm{og+$!-%mp5 ze;zj-*1t2zV!XB?P8O|(yTxY*CR}j1e(&_AxJ)$|C+#k=_{V*w;N*JmQs)}h)9&-; z7zb&e7d%~Uwul(tli6QsD-Jns95Y#8K-r^raQ<41-@V^Y@ruh2U!eGI(fwMOuCx`u z6+1}rgK1PoB)d;tqowJ7OfqH?v>mFYB+fV8N}a-=g%W=@lema( zkyT5GXLlcy_y(|6oQk`J2Sqr_P&^nZ=g(_J|3UBj&fj-)wh&25T zpU~sTb1>*K@D7XkT*Bnh2wyRNXFgsf>+vP{!w07BT9t>SoPEVikCw}HfdzNo&GZ$g zAdkgJQ@ym)=)SDZHdlIw% z?(-lM-q-b^f1)!EraaL-naQ+i)2p4>%}HF%yMEn$oh+Ua!(p^lPnj^&)Sk8!;e%5{ z+PCstV@n_k%i0HfKA70mA92qvg0Xy!N9S?@u{}-q5*MFAn*TwcXV4s8i 
zXP;%;4gTu0Y&rYzo7!XE&m(X})d+`doCXx*+qWz%oAm3$R3ELR{0rXePWvKxi5;n; zI}z;@f8}fq*Pk6NNZ*Z{tjv3+qzcfH%Fw%hBeWi!GQ;0U`xv9lD2B3AnJ-%Hfa?1n z59^4*YsLgm_(My`avu_G4)=j)zeqnNRyl%rJKsPj8DmIX-z^1|oS`TK_ihTm$kILj zp~orI5V~!<)cS3;6XYajOvhRQTo^2T&1t%UUYhKkzp{7>EoJ|rJGPfPSQ77F<-imT zG!Fm8R{Hfc^}{84Bb!a(K~LJP`5*1b{o*~PJ4)046MlBl&j4uLkYIZGSSV9tWaQeS z=LcxryS#7e8@yq8<}cQNGBrZq5}7@C*Wn8J+$7CO*P!QyvFzn(j9?WVePD9e57w0~ zPf_6C4~CreRNpk~Kup^yeqKmDvd>C-@4LoC=RU78o(a^^PG3AbY&50_$KKydNHEp_ zij75$%dz!PZN*K0Ve8*?Cx<^TE-cH0cXg4XgsLv8sCqq`VOxj__gAK^J`4=9^is57mIOU+*<-wtJ%a zYHbdA=!g@}DJh%>lEJv(?Z}cOFTC_p{qN7hIz;qhi52r-7ek>rv-RsD+rjZ7@N3Q$ zJqOspSTf>`$DX^YgzvE;&Xl_i?94HTC-&_cs~xCB{N``q?&nC(_x?FOqaK2<*C_2n z?)yaNXrURa*pirDy1Wg3J`a8H3+2}{>tVjCNc6$lbHFpFqVb?Z9dLQ6)F$9^xx+h9 zLqYU$dq-oY)IeC}GS=7HHt{t(_ON<`gsxa_dd+ePV@WN2ssJy38+Y5RWdJp44}LWs zccZT}o0Y&%J)^lX3$D$NxWam2omF*RA_d8xd2)s)e=u{yleP+#C_+~?2cK#NrQZr> z*R3V;i|eh_{4Al^_RGE`uQ=M*tlW4VVoj^cJCcVn#An{U3CK`T}>>}^WpOL$6qx(|r&E4P2Q9Q|E z@yz61_}JQBK`wqn_{jBHa#6rAeZ_$X50-Au0BY*qa*3=KR(~FNy-({7ZRPA0&OQ|1 zgB8`W_}+9F9`Ni??6BQsu}Cw?De(^U(S=J!nm1or#G*IMc5c_~Z5&`qPT( znt3lf1i~g*YhSpyckDN0xfxPkQ8CK&Sa9i(TcSgMr z+$A(732(bWQR9f`)`!qJ3d5^kIy!(Dx!wB0#5EA$X&Ybf=-f@-w~3W&qX_YW_m)o) zo(ezLea5N^Ta?eBhpUAX67NDROYiFZKd(4i2a4~O&BOTr{9ylhYuJ1WY@Ba4{w%8Z z4;S$-UKbljhK+B)j=M$W*zsUgoC4c_mhIQY_BZ(N9PFQc_MhE=t%QOqbp*hT_4-(lS_W@W4M`}C7Djjd^NLl8keGaP1-<3>?#7wIfr}p5TVb`{>076i za#?&$i3htRBx$kHtJHfJRnvVc^0sTQUj%TA3FoBlfARPB~u#yFRQ=*Ow^(g+uCw@Ee7cQ?zszvYj3e~Z=8B77hj2< zOeN{>l>db5IeROgjcqv(M>MK6Py2KcM_ZayJjFf)z zkT%YNetw4P>-HXi+{}-^CcfK1rm5)7nSgWftHko@VNN|_YLd(I`7$S%THwPBq?bTU zYjMuuN_*m+jpm7mM~}hK3A3j51Ef8L@&5FvJ@Fy$`nf$dhu~n+E&KLYdkBkMui-0; zHxb_&r8k~RZ2+}}4E1Y03@FfPm9;bd9I6KeZW;L_zgmT@r$v1Gq) zpT8$FV}Bl0Yp7_p;i-f%O1q7^9w~QtW~W^&j$Yq^Gu=tUUjV87$*n}?tY+u?a>BjyEL91S&%^I{?Hg?AWju zZ|_f__-^CROGfv)#4_czGCmCMwW0Z5GdqHKYgi)z=Se;VdPVA*mk8&ZjwMW{EW3iv zar*=>D()7*+eZyZye_(c`7aVjhR)M)(;nd)kn*tqQ4tS;zK`=RSna`AY;7as6o7gF zb{+Z8g0$9%AIPr@%sHkn`<7}+yYsNaA@ynx)8%NPyY%uVn$JUX0n%=8Xr~vY&ptoV 
zRYA(xx9@1*2a!Jm5(CY<-*uz=YU-Qc=}LB8u)Ep zR(5U4XL{X0#TZWPhM2zb_<^axpLWlHG-Z;8(h109~AXgJqsf5ZW>a;7BoqoXGh;z+%XQF%)+4fl?zeG>D_0L0l@PN|ns=iI)RpTy%Gu?rLDwrc~` zrU&0fz0}ElqupYQe`40syf)fD>>;NfY2U@l z&$-1CL;vgs2ma}^C;!P=?T+W!J+cF+9>d-57wPYSS!I-xcD~L}d)*UJdLqxtHWt_7 zQCE&G*#%BY&A;&2+W|D4NZNAr=pN`I{A|iGJPvNZ^Qc99w>$X8_cAnbK{P9^EwPbl zya;Nw=L;{7X=SbZ8A+4~(SbDYd+s}z_R$p^8h32j`RkJ0;9}E8o_JPox586vndj)a zD|MSYeU{cTJ!g^oT?npLGv)amMfqpi7-Y-fyA_UA%SCR1yoFCTu6!v?!xGIW`JYQM z`>roDD1c3@?WeeEuZ5M-KmQ33KXfY`-Pg{7kB*RjUX1tk!V@$zmqCg0m)cP2&y(AJ zHji1|NADDFF(CaR9Q%{GOpzTqWRIde1C^ibDg1Gqme(+t%_!x9t$MUt73L&;{(;** zvl9CfJu@+0@-{j+ut zd<&PTL|D;2SeE>t=Y^pQe&5t%Zq#-MVI?2#D6JO{?un^KuaRtmjb2#w`w`?v##m~4 zsXG=gy`2;EH8Fsw;j?LMP$&c5+bh5N{&s}w`Z+=teOUlgJv0B-`NR0SKuxo-tqf#; za4z3>UWwp}{q`UtITw0!Hf(W{XaH4pF~@R8Ps1Zp$H%aUCNO-~eD~f~HCpjlm6`SS zP^MGd9X)BT7Fq-6y_LUL?MLh8b$V$N6<>%6>yfww$Wng89It+|*Sw%}nsz!xQ}5>7 zOS<98u~*W`?@@Wo#^%)qg8uEk{?oUD09`BBG>K95oOi4vjK=LB3@cLkSf?W=m9J%C zM2uKjuWr&s{;24u_1+g(H@xAG_-^U;O?^FxhmmgE`spS7QbRj)%H6( za$MLWPs&b*jjEI9nkYQ}o*R+rc%Aa=nw_`Ane;bkVcJsXVCh|EoEK!KY4yioeakQh zZfDcd-uPP)zgO5`vU6h#Z9*%yZ}4g|%$@11w_YR;N*)C}1YINTqS!E>h7j4lR!Nu^ znND8pq5R;Stu*b4CoUb6i6#s#Fra8@L3g&lBIT_v;g_a;KX0 zn{q{jGb&|N_7uH`(KwoQDfxkEacn2AF^S*1a55F><06iRyv)1zf|RdV=gk0d>VNmA z-o%P4TirYL93WZl)MZyHAG5T<`9oH`j;Qse@=IOG?`AZ?8VkI6?#N9kzwN-u;XsYa_o38cANs~Kg;&( zV*4Ag&%sb}wtbduH(=|tl$`Z^_s;UjyCL|YXQR}b@4-N9#UTmfD~{y4U>lExQ0t|& zjoZ=Dll~AG zwpw8~nRe=ZC(p@{gOw3^?u~FYYElWTwQp-=9kPR>r&iP^s=o5iO$7gyH47G& zdsF*`aT*d=i_RaFJk3?)D>PuI)%}QruRb!J18(nMLVL*Mld$6TO&+0HO8&X= zC4C#vKCu7De49YBK4WsJ9@TDXRWo|NFjuL18C)6+jekkyhvB#QR;ny91j98qq+eWd zLgi0{r?#v^!_-TqrLbW0S&^W!^8nQ^&8ppxqcn8KP8BlM!oJwxt0wt)4hd04UU`x zolgr^y~EM@c#cX|bY&PSXVo|_%LT`JukPrs4M)#w;$)hA7+!iPc|POM1?J%aEES(? 
zWWH-T_4=3cbEffAj|Jv|NKS+Ym+VlGC+9IN+iOnthb0$z;?9qF`76F?5kIzzF7ZyE zVMY3UdRy(Lipnvx8|!hx^LLwSjzJxIKMs$LET*1ohV%tjymLeF*FZT^|3#ju-Ck6C zS(D!Qs;6}vvvHJs!g@!7@yrVpuQ-SZ^f5tvx2yvvG#;;CBZi-RvDA26PKO8*A8Rw% z{g(OdC&i}#oZ{y`BAjo;gH^x&1TPwA=|tR7Ch~t^Y`E9kAWamWqi|%vi{tzc5M?jNuC(9K$43#EF)3&LAD51T`OVK9}`kLbVk$@V4Y?ocV? zsxj%GptV1(V;7<0E4=KP?NYcxwogT1jt5w8R$eg2EC<0YpN%f$e_>%wp`*tdXPAuc z!|B?;&Cq_-#v6C55{w-ki(P*^;SzbmKFpJnc)6Ue9}gdsIeT_(J_8%2tt;tan3YK3 zSW}5XQzLn9i!owE)=aFO>{%TaYIE*Xbi$`HNz@{}6nXcfFeTIZn-INrx^q&*!$0RNB6Yx_>b9 zRl!7s*R}}JSNmbq^ZN_;d<+2!?P=1pxi(N-M3_T8VGl(8diipbxD8k*quqn5?=ZTW zO8<-*<7EE4{s#a>Z_$K;cS8WP4k8k<=@s_2b z9?LK3!S*eGjO`R`02Z__$!@_1nvMOwHao**cn))Sb$%}w5wdkE>%*f^Xq^_=srn>? ztgp3L+m>usYrd2Mfo|(a_#De){DWL(7=kwc??ubxS{_#7(`zu3p>UhV)XihgVzzsCs#!fp|k($WVl4 z2|3>3R@y#XLg@=EG`t8j9`o896R!eqR$pkD!-I$iMm{QO99FoI_OpVn{6mD)@;3WR z(dWRJ*Pd0zC)>&T;g?-VzaFitF$gaVU7x64>_PNOme}QpodbFy2C}&uT7a*u*plmC z+aPvj_r)Un5LCV!E@?%2j#=TC>htKmDv`O3ek7JK@E&MZO&zEC{t_!G2p*tu#uMQl zU21qv42r3dH^!a6_vYv>3MBhEX^n@HDOQ8x9mBp4+Es7ghKs{fp{sTk!>eg=tv4IJ z(f^&M4od3yyUq2&UM=P?JH`Tcx}SUuFID>1tupPJ#`$nElfinl|F1`SN#9 z?Z_wdu{@RwK4=*UaGAA$S2cs=6^EC-OX1$bKXMKh=NdnkYL0 zT{D_~*%0@|9#xZZ3b=0CUebSdeav6Kt}?82Rl!yb!c0s=^b58X&l?ik>7RFy;NSZ~ zH$p|9aQLH4>9dKkC!9$+%O!p>SG}zh^<#d$>=wB$SmypNQonIFSYj7M_SYQxU^Bk# z&E&gRpVmPqsX@-0A$0Ilag%YZo;282DCxTOU=u9x8`a1gMCYX@f9|*O??!U|nM3nw&)xkfAH6U7 zk^Y5#&RgAZjS0sbgElG-7+bL+h05#x57Dn^}yp_H=EqLi{5u=QECoK3w(>z+3^M9v4(Y((FU=;s{pR}a(LDM(&3 z+U~>=cg#LK8vo-8C2}kLv#gBZ6j%Oo&lw}YSn@LSwqydie$J28y~*;Wtof8YF(0#z zBd@jAc)3&grxu`d_T&WK_SWB|-sr`-Nef3Qrln;`rM|_s_^NaEa^Yo@Oc4#0>}fF_ zc;lv|$LJ&n`aj&oTXY9O6s{)r)P<7ct=PC*ovg=j2=^qIGCY$BUvjB==r-@g?>6wg z^nPs#(hv2Gjm+6YXH4~d^5xgGTTNTK_Ij-)7H5=JYV`=sPDz3pE&va~l6XS9aCLW2sFu9%ql$rN0!FqBWONCeVXrl8BOCfJt zeF78hYCHPX1o>B;v)uSggc$l5SRMVs1hvC3m0xrarRczM-QWAvVVfm!(Uw2*@gXzf z2Csrk%!OQVkLKrfKx7iVUzEjaltPHRF=IhFn)yiHz7hMFxr?gLLVPy&mXD{-?1&-P zU1K^cg805Ay>RsjE707iIjnDXj)fgOsk z_?$Ph*$S1<3p5_zg=0T144SlUMtYw|(fB$W{KF})QXZvTV6{YMhwqtOaHBx`(bj_} 
zfqLk5&5@on;PJInC5N80f{+6bPo(Bw01cdfHkwvf1O4TNz9P{C7w9Y$TDV9eO9;sHm2JW3eX?9DqJdZLtfAbyJVu_YDkJVh;Yk}tZ zE)(};${vXM{E8%UdcMC%c1;C3t%;(}qP9e^J4X-G^boPrnc@`(-xe;jLwvVTc)KFW z!w50EQeVqB^nKCh+xn?FiFj>G@hJdR&dGTQ=Ns;&&z(c~vuHdRR&~LO+z-1857P*r z8VW}SJSz{sjqnY?@cd(sQQR#wdS_YUJ7P-g4vNZ+==C8AiDQhutI5Pe&34MKt73li zCDPwubj4xn94x;dQubNY?zz$hl%BGjOMu*Wt?&2F#S%|V`_jtA;8yL)Oq~U_^kKdgOSf=T(BDhX zSCDb|Nl$iA(0Zi16`0$Iui0K4226|ZJ?%e`2cGbKs403-46qGdncN%o(E2bmf&+pG z{H63Mtmz=w&=JaC;~zq<8^hH*g&0doP_?&0c%i41xhwCH{wHW7AEgpczF5xlN9O@M zcE1BxZ3C(Nk8R;Y`$AI4VdERH<8Dzgc03q6PJ!~DW&3rp{SDaXU~Ky=**@EDz}9C; zIm=Zkb7pGb2@G4gNt5r86i7{2B(bdg5ZG@uFWG#=pIRs926CRrLo-haC({cEcY{p7 zou%P{v6BULUsJ%@_9tB*?L$HL&islEhP#0DkM;F=^iasVsN?t%A8T;=N@0&dycO_$ zcJB29{S=T-e`6#rmk+|l@2wHG$p%e#!cs22bAp?DXKr$;e`C)1{9e!-;{+Bj6*J$b znuW$8(HG0mBSLe(PoFeO17C9Et#mz3Q~Pq~x({*c{`t^p$K!x-NbJq(T7@U25d7(5 z>xc_eJ9%F6SOMdgx6W6b4Ti_$=E`YH-2obYXfkULIiEg-G*7}&zj8E7&=(ZbDqc9! z&%g$)o?lt}PXMf;-C$72fWS_iG5vZ7y{|hD`{&)%B?fxkjs%v4K;Mc>-maTd;XCI^ z&55}*U~n9NyN(eGx46HVyin5$zII(K*_BWY0_*s0uP{9hYP%TVLSPQ?NfLNcz}o)juG{&^pMUT$5)mK1kftm{MKtmJvPf>~aw6cq}}YU#u!nJ99d{C|BR z<^M4DSu+g3y>(gF&4=gVH7RZVOPXo$^Rt>)yyfMn9RKEx*O1;+O{@3XXN%^$P;=^! 
z4`FVbM_U+Q1|t7F$%(dWhA*?0miMPs!05$^@{IgC_>ho&`>OCdYglsFg?n=!OG(B} zKuP8kT{~omgR@c)+^KqEBq+2BSndpw=({A3;ukW)wkYg|#-~qo%f2XrZ_nn+n5*r; z2X&2L%8E6>@yX#)`WW?IfZehZM6J}U`4smIt9_BviMp#zjMG8ydEsJzEH%I%S_l0WXoq5){iok}JPJ2_l5Svjkik$}{0z61waj3*2 zqKNoz5#HoH-jNj^h)1z&z^y|M|9M|=7vik}vuzZg0`$I9^b_HHQ~j3Dr}PCzUb#7O zCm@i->w@Q56pjoemTn(H_y$m14|A0hvBD)+-BJiO3&5eymCx7lZTObiKkcOd?6&H^ zeqBeW4ttaS2Gev2>KqJ>-aPFEA9-K)!bQ|~hfNZ<2~o@6xrmgruZRRIj@V8Z`Zl@G zrxk~sTH+g}r^JEQ%K0W<9cF~qm0#P0SJ>hb2Y-Gg?1ORV<$((m109L3_CsE!dZO?| z!Jg#`mQ65i>7v~^b0WaErg&+^pKTyWcGCkvC3)zPaAPEuqlKmRBE#+^gH*} z`+k1k{<*!5bI-YF+~<9s=XgAvLF8FMFPf7+YaQtVCfJ^GZ+Gu3D|G*8NqOfkw0|wu z-SRNUV*1bSy(MM|ZJem9mq8#LKfp5`deIS@xE^y2U7rdIUY&k)xa<_XSX%4k793@7wxI@hLwEm(teT7mnWTsSpnV6+~W;b3E$e1Sf+BG$9Vu8QS% zFcmJ)!k6z4{CnP~1a3`vq#c)21pG=nGmC}x2z+lmf8_dWuP-DVERn;C)UeJh`pa$JI%E>Dz2)JvehnK7g7 zUK|uS=GJ2U)en~471=)}athphXlz+pX@OnN@GZed+Cb`K3BlJJ%h9+}>K}aw!ycxb zDmrP_2HHo>`W@Z!SmXUv=Z?ALyL`4dPzLyqCHh?27mOY47YQ#d+lvn=J-=U{d=8F0 zHro1Y0S9!y)c51Eb;Nz_Posj-`jwdWy6wg=t!K2?Yf6Bt>ei7xZFQhyeBTd?nOYQw z*3}ukG6bX7PEvU&NdA;q#GXm_!=$wB9AhgyNS%LJ6xL5%O(=&%(AcN{cU z-f-LkemZ$nP(R}>?EyCrCOD^sK}(Tw$BiP;)F8C}>5=v5xl6i3tSqseMQrk-6Cn5&LK?v3p|OpR36n74*+m~CfSyyEB_@h2>$AH@TVvQLCvvA`z8%?N%lXcWr( z2JzN_11mBKJ_T0$<8m0<2b9M(4Pp5G$Tk8O5k@_)9z=Lu&{VR3z>xt(k7Dh=`RU># zx6RX&pDEy7Cw45z-qXjLLSuW0JcabIIyEAGRE3QlV<-UtVyaC$hox}cm#Cs;V zj9-oDr?5#`1OL^vz|v>&>G~c*&I)fC0XD+dK!wY#xDEQ!fPSfryFp9{cb;L`()ZZo z^n<6~F?>~VO2tO0V}4qg>rrWk4YnJ>qut>3e4!LvVi;<3xU~#yEumhdRILD&#vZL^ zbpp_6x%P_Gsaoa&m($gEW~EH4!#T<-tE79FfvbpjGrary2y9~qIuE}M%rfyy8)@sl5+OQ64OGt=L0Y{E`2Nh;3zR4 zTKhd%_O>=Q^?!aeeqKRhKWMdbIAK%wo!KNl`2$P!E}2hUN8x3sc6rYbH~4*{x>tAY zQSj49V)_jw81C8rcdwb!UZ6Y2|Lnt?5OC>a=*A;lA%MBGTrF`<1&-uDI_7FagQK-$ ziLdUrF(zw!QW^bL;8@M(r}}L~ehQ_1(pwsn3)nil1fzq7vMFWk6A^zxq3Aw0#7$4u zOfR!Z1}Tb%?FNIH)Y0b;93+45Vu(MsOQa9bn271z9=B$$k{P6u&)c2PXDDR+sfMek!vV+)ni**v`eIr8~`gSb* zVNrVZff#V;nw-exz%aO1*=^F`v?-uneb&9nS{=e07$e4%2}6%#ox46?fzsyN(@vUX z!_YB``OEGYc;G@@M!@nL(Cf^Op&Ao_TjT3;LT+Ays%H#0bWN8K;~h}n=!k!M@cCNv 
zrbgJ}pB$K37|t56lk)-mRpiDx{;m?ZJ@IMZ~kW?mdhQlf174n+fxTlv(?=` z>R$ne_Iy0tq+Uw2mnlEuhxV)Unsv+Uu-o39!LwoKfv_y!K1J?qm_IIBN)lzg%@uPO9y66ID~ETEowUK17J>^<`ZNAo)qSR|_r$NY?d)j0^zT6v+cCZ&!;gYSf*>!h zv-r0P>s(LI8MDNeEc%u3$Tq=;vEVP_hPn=BF{z_U_shhVRz_(ZO| z-uvZ07*__UR$({lX=3p@s;8yuaqDkfJ4@yL;JJgy2W$PsxtqvSxM2{WM8wbD;=K4Tt_#WWkUvY% zb5P2x;t{?DO0?dPWrzv4uj3``25Y1v6$pKHpSclB&UWtI<*e6}4C!SvopcAn&iwr8 zZS<|A)O}quTaD!P;pM8d^?#{0`CienhhEdxb&VcCx+xg@;u{!aJxY7FG)jG!?@=hR zd~Lz9o(1NpbyWV6pfcLHfrm)Qva>X~q>Dc$@J>(7-t zgY6r~yEffSX04Ze106e*!shyDrxz$cCA-Y;);~Lg_>K_lj)uzeqH%k)4}5)`^p2o; z*&L|d?ThD6KdC#{u^XN~dN;LyL<`PPMtg$>rCIF}yyECNSbVo64Dy@w5Lwt_vMhP`m-rKH&#L7nXgN zv>TB6EGcJSq-W8hgkBNPPw)A9ig6htYTIFH@$5s-8Qt!s2Giek;Hj5G@?3Ho zfP3N!qev$%;y&lAt?P;Llo-CFVl(&)Z`<>s@Y5NM)R&2-umEfDkAIX*}!; zKhq8r;t$^(J=Fjr8SWFBX?b8pnQq0b3V3fU)BuCU zU&(m>LU=Tkk}ivggZS6|vuCd*S3}cX?tlH5r-|pLS67?iNt(O+=bk6Pg*@k)8+-I| zfj~*Om$|06#S5SMUo9v;YuI6O@9H90#W=3@SA)=(DLj3)ZfLx-yhhh7aOEh25(&NQ zK>gs|tqB7eV409UTlcprRFCtetTw$Z0@4hr*t<#QveEngGnY#P1n5rGe4!(>VN~&Y3wHO$pJDIY-ocyRD&%8n8>F zElrZ(w`tqeTDFmO-^u7Tn4{*2nW5#|iE#x#*vmR^QLO$bVP|%W2?n_75#?>^AHuP* zYqWvk$S7J6g~s8b>&%qs?Y!GKdYF3TroEHK(rkv!Nwy1YZN*fj)Q$AsuvfG~nzsnS zD~|AB6nu5(&~vmE@$i>25N<*Lqx<8GQk#HMg12VMT#&`5fcmkCSaPa%-px}fqhk)YbS_e>=*R6dcHg9Kk~!;Mi9e(M77fti-5`TeGOZ&c_4W|`TkTh6 z`8}o58(DrRx{+}VD%WalvW>?5pNp^ByI?~+@3v$YU7~);7o@)y*R+p!_=XbYl;7?D z{OPonq4?{)p~19Ll%J0NX^n{y^(mvnnz&gwt%q0LkyXxKq=nsS+c0VL#Tk@ZpZhF- z3*{5gZmSH{w#ruNaA&oI5H%@0jmFB-YuCAM&>E}i=QRqy2y9~(ho-TL)vHmvm225EKAN- zHm2)WbdJFi=W}l^-uDHDC#4!vx(C0Zs>-2st z(Zwd($oKPQnugDrblIH~d$yOOc&+|{A89nk_1fd(Mbx9z6A!=l7+#xaK6Uyg75a^( zFSRvSerNSpQ_O**<{BGzw=)F&W2-i}++{j7aSmNAq5ux@$2Y$y5Plr`JC>)l}8Q|<&;QeeQXzp z3-?(kO`@FM7wm~&zPoqx+b3Df`Fk_YIyueEyCSLrLho9r)5>17=5<4iY5j_6*A?T8 z)N-E{?zP3tPZas4m;xbme-Ynf3wG453{S4&N)-3D`Kq$&D%`H>gapA4W_zHT#an~= zng2D#(m$L{Cmf0ASU5VQgPm6CBXAKxbI58dgxAG<{nU}bkzpDv-#34{Xq4f-EnEb% z8>M=0jFY^YER3&j`LlCK@Ct+5h!sBzCMRdNp|~!%CWdv}llS824%B{Ib=omA 
zq95U_YQn^osg?YRnwQndNyu5xTc>j61VTXCz@9XDi3(bi?2%WiwoW7arvm1<|Lr=>gBaqP$Cl?LfdbM(Ag`D<>m{IE*b z9TPC+fLj?&Z+(ezIIF+#BJQJ$jD=%~mv&as>ph8bicYTytG!JDf!J@Xxi>M5b*`oO zUD3vCK6f8}^lK%|-lSu$^2!MyF&^y%ccA)?pCO+Sx<1Qu$T){Kt38;$bBD!ARa3qJNT4fyDVH@MlR}L=vxy#F0Ud{~x{qnRolYd@xp? z0vSI`#&wbL21Gyq`N2s0?EmZrgg#5k+1v{2jO#&Ta7|gYisq;}NEmW8ky){k7|(oQ z0_(dsn)r$}|MUx8V#K^D4Y?ENSBgwdR}xO=ue|c2A_B=HO2^h%P7GZsbLRi_m6B*> zY%{JTF#08fx^=@h?v7{oX*Pacw|`IS5%oQlmcL|({&nlz)1gfCH#l&Qcn{Ej&aq>& zvg+@9w9SZe%0RIaW_@O5@`IXj=IF^I@s-&Nw8usf;^h~vQ*Epc{VDaoN=+KoUZL+@ z#faIWn0cdX2mID;U9KeM%o+!0+jgSe$7d4X68&<7$-ZT1$Z;gANG$px}dIH{=qhQDk1KB0R2H)h`<+!`C3$=fSY-n&%3QiJjxYR`#5;pIV2X#Aruo|XNJ zpS}3O4gIFHZTRYfM=f7XQSuDL!_vCrRgYYSl2@L33iItne-Hkahwv$Q|D^B@+gG4G zvz)_BphMg@s`oh%H#E9+MeA1#M9r+WlQhQ!s)s-N1v23MvY7bxA6f85!1t9#RR=LY zPhO5BCnNl^+T(i%4_?>5G_)|qfeIkZ;*HSIiBcCNlA$%d@=}-<~q)#Cr+abf<#FgG`qJWzy z?%lg4PypjfxxCJAYZr|-sV`h_#cu{W<&EJkG;B+m)YYu9kLVoFe0DzHiZu_h^O{(j z-oBZf-t}l6D7m9muB`UvdUxV%GhUocPFsm`t+p@TSY=YTz%y|GM;AGwxWk{&?y|2u zBi{Itv5IXGfv1UJ$EtgN+Sa|Ks$SaZlwBJGzvir0%Ubo8N%>vB%T3|{iaY6c%M3C> z<8F!=+0sY-(4WfUVf_9?n8go<`nelW&+1?B{rjC5F|N$o1xD+oxjBSCOG~I`;UdDD z-S;jbye^sm$JFB>xDtOfd-w3L=Pt~-S6;o$be`JNoI35)wij3CIiFzutXu=v&w1&)e98oC6MdJO4VPgRvn_!q#MR|W-sRja>Vc134E!d3m?b|mcMK*F_frO+Xkwoyr8x_`S$dmk z>QYvF6cKHE{ASm(MeE;e(O%&fY`h+t_ zJpi>=o~m$m7rtX#?aW=L;7QvH^g(r zYVmJG&n304>FaN50H^ug`}jtvtG0Kwr}Azv|F~|~`_6r+zDeSrgaw4Ba7Ty#0Lstd z(^ce@Lf?UvJEx9?5%cjl{_{UOc>Xg_Z0xSRX254ROr(yx zJW4bkepo!bc^?lA^?Nz6>0J!w+}Uh&h4&bew;Ug)l8AUq%He6TaO~>~LDzR<&Zz%3 z?7XQJ#-?OGah6F3M?#0UraRYzEa`K2c$_otKgh+SD7p{RZWuTfW^ILy*|c)sU!DS< z>C&cshbjR2><=gHK8D);`Y}q&6Fpzs|6#??Bv&k3G2~{5|8zVkvEhQXq&}V8>5o z8KA$@Q?qU@P+NsoG)XUJqvw^jv<+t2!{`C+lK9$jokVxUTZGzqZ_&uwca$zpMcmG| zDE+ap4^ckaJrs?LX&b4XTsFYe<2KmkvLXn)tD!#&P1V2yx921QZ%KULvK)_}8^o}o z+mrh<857LvV|uh_eI;_oiyS za3-<8C<2ogry2CjJ?ja(f!)SqEPd8e@#$JZ&Sr2HzV)TmfU2hv-|su~;ZluO65qyj zFt2siyf>4bu}phezX-|IxT(!R;`~pcZ!>npSGqfBSLPof){9asFH5vX8DulY7v$eru`l;$m5ZnYd|vXJ 
zpYZq><}BBipD%5XqjTSq?x1jgRG-S5{Q-UdO9}+m42EH{RHI{8X=d<8iQ%F5rYwK@ z>gQ5)qFvQA9e1K%7OyzTcl&>M7%2ol7|}n$TZ1kvJ_Qozo5Y{}|G0?%@Ve0VkA)+H z-X|8m0hxD8=7W)W3jZg5mKE29+GoWZpexG{hOR98ENM3&^;uHRzE`8id+iy6sGiTZ zYV@5!!1KXLtn)+6hjP~Wx$xQglSDb4mZycEF$}*~PDOYxNKx6uA;lV($Oe`^+~&H$ z8xJ!(b$V{(CY8COW+3eFE83mi92-+Cn5h02xe@A0>?E~q-ql9-Qo>CQR}FFb+b_KX2u3w6 z2IlC)N5R?GelT@Ve;%bbWDri2>5=tzC&VzVMz-!}j-oilU+Q%H+%$F0)Umhx*9M|J zidhs(&rc}qi6Wjm_*X^*{z1~QYO#ijzfaTWow3XZ)zc&JRDlxeH(1o;Y$&dQaT$2- zMtQx0M&oO`w21rg&sIrz-mNUt&CX${f4YssM@#ICpy7!%VkICuT}&G83WUS&-HVbp zs$y=Fo(=6=^zreReda>f?eI3I95$6~I%r+I_A*FR0!s(}(@P3yg z$NA4k;aJQ%jW}dy)bY>Bzu5<{S9SH;J*DX|OX+(h5A`SVV?~dzzGaMQD_{5M$ti=L zh2{rd25%+Sd)b*`V=Ted@@c(N5~N>z>b&VCMwC0}w(4SHZru037Y9MLa$89=l@(aZ z=?U(^bZyK(@^i%%`9yg1$1$pw%5U2B!U?TwM^T;{&u=M}zYu?kVD~=quy|`M zxzf&?zyC$8__dtVf2A5$sMme@DrS#0s<3boVZsx!1%%fH_D?Yg92xLVosYmbKzcmo z;^{qv9m=#xHUgk^NeGwOLmgfs-T*coUkFEjFs8Hk6AAa# z@2MrA>2*y!H=enAGFfav6Z5O*zs4c#gxygx!_NJzg#leop(2so;FP_aoAc^lv|P4E z$1ul1W>1kbZCz6`?T?e}LG};#X!93j_VetAG%a-|U+@!7z-5_#WwPff&9d2G-r>PP zSlgC4wze}E)?1c@uV98i>tM-(ZGZI9{2z9YG4sLnE^VIr`aKv7_Ih`|IzY%@%F~KT zFHEU7)5Sq19W3^~N#p!^khNZ$e|liTtCN3kd727@%#JrHl@sky+IZQWuq6fW8B%IU zPSe>d^fZt>48HZ=x9KRZX=;6m!{j)SGwn5dB((pwmP66gCL{w#@$ zNaA&oI5H%@0cxMjyCw6%Sa}L${45#QMaCPDelT<&X`lViZty>SmL+FD@|V<)n2y0+ z(pD$3S4hI4R|SXH4LJbAz;mZkr)*&3YpDx+j+;W;9V1c#e`c6JiZy7F#YS+|mW!@a z5*{$70Jg*idq61>)wR;PUuko;PHV&QYO36c>5HG$J~2htJ>V4=w?^|=Wm9yB7wdFO z$YXzK2tT>tU-b5};zp!8N4I0_blS~3x}K1B+cBy6{$s|w%guFhcV1Gt>_QaP>7ST4 z<1~90??{5g$G$Yk(hM~=*{Ti&Dx-ec)mqW2E~{KzU%?hbRfFQaa5 z?h^)FC*BH$7dpaC66235<;-DhhGFobKwEe+M|&4gafW{!QwKxN=72FlUWJu12eG1G z!=_ubt^mFE6V~p!sc5{V@*nKIuw!FJPx$M~!L)l(%VH!Ehe)Zve%S*1tiTn+_AU|d z?^Wk+8VJD-rTQpXRqVn&Ym^GEc9%m9&yR%@-3@?e7e}Y@1*ET0w0boo(RYx3@_Wry zSKPHUG?wnG1xMEPtFddO0nheNCRP(jA9&BW+fg2fwMl$9hUX%C8vR>Jd#)(f>26&* zTT%s^W&-1RkC#A+$K^pBPXb}ohJ%l92%Z3x`}G%;HV4w49QWPr+#W(RGub9FS6jdo z3f-Z0sO||4r;e!H*Et137kk49AN{?`HE_~tYP4=y(RNj$v{KeZ^N@~L?vIQ>?k 
z^TR8aK184Y0dPM1D|y#d&7dRu`}Id;B=BW3N-H?Mh4IDGXjiKjv&{2m7pKp|KyY*N zYKx`CLncK$F+}MdhEqzktJ~jf#fDx6X@wkGq-`#I)+63n0;KfD!s0ET5%P9kPM&2C zF|B>F6ZZ^I6(jg=!Tox^V=mQGD6R`sC`g*2cmr5c*wOh|gcqZ)I{LgYkQ1+R9bQv4YL83l_RHP-Ba7`V zT@ymc*+a%I4Q*mI!2YlDm0<1`ChQPBpeQK__bvW8y!pa&8vDZOJtKA3nWlyUdlR#2 znNRE1akK6F#XLN{TKMOT3{>n3>bv1&1%IZ+vRh4iLN3G9^%Zj-aH*ik#sD8*VxK8h z%DCcF)CV)hAFW}RpWvf?o05rrJ8vcCi|fs7b-6hg2s@XZnUVjQNR(?8q`2d4d)&S2 zX}&P$hv(`r(FmfP;vc7vU*^akiCc3JBDM77z;{Al39!UaeBzG+*)O+G>ccR0PuY_8 zW8g@v`s}_5mOZ~@lItk(UT4}w*kbg!s;xcO&B3!g8D*0f@t`Y@y<@n*54DHv{qI=% zEQ?p18Fw#Ct+6`FxEGcJQ&$@mwvK@nqdGv{ILjV|bJIej`_5{I0&MhT%2jPqE zuf?vvmVmn|Ba4~-PGF1O_+`gb7qBl%O;mA{I_OcldGJ@<5?IWqV#a(`%@AL4KOp+H zAV@y8qe*p_9hx7lIqHF549Vx|I|n~_fv~-|M4bfPhRY)={Ek?Aj6a zAJP^ZwxjyH)h!-V@xIMtK_ffsA=}vK#Ky1JKuOH?E$?1dfTJD;^sgS(VBL^Rwkx|S z?ZL~h`Nyy5)8^#UB?@@5Xx39T97p?^v^sg$PObYKU^ceX{b!&mXly%PYi70=6h2Hl zcSf;>7SeX^q#yfgJUrya4nMx{w7eLx;4kJANdEX#^yz8gE6&~p;>XlL(aX}`S0AYm z)N&W zk(>j$NpD}!db5hHZtv`%wLDO_HjzSh!cjKvLoUd#U{xwD9j%Wu_lI6ReL?}8F8%gp z;z1uXG;M!n*%{*Ren0~?=7O(htxF#pTcY8nR9}gq5cYDfuswh^b zK5LpKPQlnS{J!zb(s1LXU;^ix)*iZQ3gOSv#xAmO5n;hEV@-tDMac7IjrAhLzBF_) zm_c?krNh79S>Z>5yNG-+0O}>}QJw<))*nK|&q8;;pZ+MWi#gibq`fifC&SdXZDW5m z4WqxZGf(MQin03*N43^i;g4QqyAt|rakWS$A!p}RCRMIxcsRkf@_EM?Q)#fdIC$5jOE9Lr+grCgxxcX9mjIRiOtxC0oAFI}yaVfLpoA`)uBt{b#Ht=%rhwD1T z&hT4#5%XDcc_IYcCMrS`YxIJunq7%S{#Hae<#^};Y{$5IS>_?)cfu(~nlYO9)1Aw7 zwQ)m95qr4`Yml6uUiolrD-_M&IJ}~UrGHU|FdJfhY?)XfcHN}$!ICc~@UQazxDInS zaIx&^sBqs{XFIJyo2csZrfXlBXUxNw$5xm?&)%*FV`ICagWJYM_8tYW!QkV_zW8}s zyCz%Rt9xb)t#s<0LLBk7Zi_B|ZA?S+qqUi%`2#~%vE&r*T6-uUT5{aqoF#wgkGq9% z4^NF$gEOx1)`9O+IUT1Nqa$&jRo=wTgdc=39 z6j5Sd^F*8$z@x(kcdKVcz_yOF2Z!05z`;%1+APiGAtke|_Ib;5WY?fi@|7Y#8hya^ zJO4Mvr5ODiJ06(Bg;g8+L>A4#+c)zPf7^G$I0r$+EAuPB`_tXJydEcjc}U6x$3PhP zp)(;@A$kxfd~%o0S~t4HBs@(Z?B*7uDuQyZm=a4%bY^{Yh}-7 z>)rUo@{A1Y=4yD0GrGgDjkQ0uyf>}J9J8K0k;LJ)10L;9v8Pce76D{``?AcTx0Bf+G@@*FN$&8J@aBwjstfWK1lE>FpU-jxe?Acvm>oE z9O2J0W$k$gTttA{^?C1y_(dpmX7-+)XuW#Q(qvq4%a~8RDDM`|Y&cHjgF)q_OXVm} 
z0s5pLCgNuST{w36g>OF@l(LFv<1Z%}B_bZab9a@o(G*#g1g=eZ=f^)Tgxw&b;Fl?( z&vMD<*${Hp?0EcSK8k0UuTJ+esx72BD7*Q6{F}whwfj7;&1%k65$T2@_t44^SJ|wbQ9&DzJy*xQDuJ^jW;&cUn;Y(1bllm z7T>%GC(0j>uS&%|#i+jhZo!~rQ>oU>SQt@G|H*j>ryCE}XASxTxsu6}Zu&nx3b=XU zdNTEr3L{AGp%~UaEINYb{pq&a5^4P0nuXn*Wo6O_oe-x_F{H!uDA;Wx29Z~q=4 z?8y{8o!SsASxs8dY5i{axUDzp)=f8Pr!mm?`Ah(6kJ5HZCX}d0q0B5Y>N1=|r#bZ@ z$tzCs-J-w#$HPeSgZ<;JA^8+YoNp3;mc&Iw_mOyAl>cyKSoj8H-YuCAM&>Dy@w4bY zGOmk^Hz56BEc+~JHz4&{QqE3%yZaqKGzQ0d<~*|f{lL{=gR=FJFKOIT)gC8)D1+bk z9Re1kP(Gnp|HHZ7)u5N6o6c+TfLXlKUQ;{Pm)c7yf1pzMmezAb(0=DpcC;>Sd34cH z#^|ozhUQcPs0&O7@x{Lv(|ZX$!b)n3AllEADv-m5dq2ANBbeV6{GusuZ8MSr1#|EB zT>XRM{8q=xrf{?)J)^5%Ubhpy2f-bq2oE~lcH6wvBW)m;u(+kKP#IQ-G&K)Mh=B7; zT@}B*r$9dLmYDOxS3sZAXwt#@Y~Zxi)=^?K3^;8bQt7jB0W=@3xSMv@fFIP~UjFD4 zgRv+6IGK z=8?lN<V&u|E5vZuZ#Y<^ZtFSXPx9q^__0a zh}h7nafK+x1;GhgiJglM`1((~_SsMoL~xJbZqU8ad*&%hz;> z%AuHi74g>4#PpqP5T63GB}$+M;e6A+wFep^{8_ZVT5p<@*AeZjpKI8NuFe%}r<8Gt zW)A}20FM58^$g|Rg0Q~kEOgRhH6Fs1fQT?fvlkRZX$}emdcSiG{ z89Mwv0$biyJxgtrg((Lo>~pby{;(Y8W^B{5qm_y+d*M;zb-5SnSo)~o&iygi)NaiX zikv?DVE9G6TbkvMp^uwcVk0&mqIcD+K??otVxv(gI$tJU@Y!pNSE`qX$y`ta({D|> z&lPV4;!oaIy|5$trx*?`NhRiwZezIV>W?uv1=s^x_KEd5>O| z@6M{1k2zvvC7sy*MP~ru?3*HgyZ`ZuBQx(mzT5wJ7)gFGlDCHBQy_7^N&H#V|36$r zB)AB?F7&&Ez>y*G4amG(G9Qf0Q$Y17toYgg;=0gJR=feavixAAeU`KvkoqhsXSeNo zb6LG>3?@7sP2^6jXI|c;`DAY;7p&9am6X9gGxKMst$*+$ z5xM@)-%Po{+U(sKcemBT*?fs*HbDc_YW6Qe$F))Z!^ryWn`4OmR=AYnvk#v&R!Cew znFE|D0+qjmT!{T7#m>GTyWvuvXZ*k&Y!6K=4M+UDLEq8`e{OTcxM%m=zg1ETsR7%T zNEKZGbgG(APGl__2fbUyiIBI6J)5jtv6QDg>Bg}o@S0B-WAoPt=-kJ#>><~ANI5fR zU>zEQ_kVa#`DBFRbChZmBLW<7$@s-Cg`Jnc%@9$gB?qhEAGNaBciCm&Xq1@3(zxsJ z*Qv0aNiQd8BeiPr=SpYzgx_}S>svwaX5Y@m*U|BC)3YrbP<^4Sue_d*Z^RJCu z6lpk|{JHgkxPLi}(yuIYxw9R|u9&MKR=4BGGxFav)fpwohsfhpCzX(c>!!LYl)nD zBaPo(aocnA#Z6f1vPV~kXX1f$-He@K@+olLG&xbgI3K#+SWv&mxdF>&H>hn>)WK}h zAG|ZYQ3Z6{!)&eRhi19KUvJ^_iDcL#%M6A8lAp3rn|ML+~s?Ctr?H#Qy>QJQyKx z=|SREK;R|AyE7kOX}G##3)a1pyBhTHUBlO(?V?fffD{%U2fVRggvb*I0~c8Niy%dW 
zl~)5-Q?(nZ2{i2U%VkgH6Kt`{@bz7G4m9u_ZKb>I5;w$5&6a;^)(UBazPy|PLskm@RJ>NmbYAxEPVWYFs`&QiLT`C6S*-A65$wjuiB^E2`$OnTm< zEv}i#RE=nM%l)seG8`sOcqw}bL*7&Pq2SYkp!w)3seQw<%#wM==^6HM>KQNIzuNC6 z7{ZNxSN8ut0pBg(&oaDPDc^FB4Fp{^1mqSvNPGh_@0QF5Bl8r<_*pWpi;OoQ{a~bhmb4p?`Yb7DO?O}E+n_xPMFymftetxX zhkd8t?-=TY2YtI_em=bkuVh$cipX6Cdo?~^iu0sngC5OOTR)tEg{${mydqHr&Ua0H zY2k~+O_gq^XuimS-<%bSPOZuZHS6+zX({Z*vlGSpdRE3k;k?du3$2;>74|aI3Yi#e zF!a6SO_aYadsBaB>Vrg}^M+cvLG>hzF1aE!ndO6X`1sSO6MeCh#XvsHQ6UEGnom!c+VtehXVf2)V;mR%Jt%Dk} zm($`^%ip)i8;;c7cb?I3y=T+quu#@|(Ih%LGI%d@QA)Q>aE}h{A(7I8X%Tj0T$0_c~)1eS9J3=|Na7XQ!(>UJ1C? zsNT5Wa1{imza8HCzM0kjLVqm!Zt(>6SRTbhU*80>x5ouuRo;RJ z9wd607MbE*N$MulAJJHvr+INmU>LS6z}Fb@*+H$KZh~hCJP>_N@PB|?4HX0~Jw#|V zKl#q0c%S}HH%9ypVU+o$Kg7HJ@k>`)2;3uhFutF_;{bhkX+)kl7|=gL` zSoz}j^SFJDE(BwB2Q)5h?_Y`cc(W?uE-H15b^bDY zhqLs8pd~deAIk(`uly>Zv%D){;ui^rU!oE)NJt~R)lv~|YBLFIpD~B78ETgYTIo>H zyfHk(6Xkj5w&wP3rlR?w+_{PkF_sH3B2{ni2gz%NPK|uYB;$0`XQ|yFl6Y~cn5t4IHY^v)T{R^iTj#ArL*EkP8;4>gxJ4aBu3m&)u^Vo3PdB19;6_7+ps7|I@xv!hb)v>z{PyG0Y4Z8-b%lo(p) zHuHxW)zn`mPSw#L*Mewo?6C8)L&W;GSv-A4bybTADg6=!nmORcvuo(oJqaSY;7bT2 znav>0t@IYteW@*PXT&H?&68arW#TcR&u4nlyQxTDPeP_jMwoC?K*qkEuVEb?rmmJ=t&X}0unny z4)OqLwi9=c+J9l*&$ZL}l+ysqE_|2MUUCguGA7Pv7_~q%-fx}D?zBTTw$<-ETDsv= z5evB8>j}(cq$LfmeFC3iydMi=gJQ>@5k=JH$M2yf~GP+l5Gn9&)Gzmkn3n{af1xcPuQz?2}ozHUCl} zF_#j+^>S9PcrF;@Irn{q)(ao3@$AJ3tKj3HQtwI*Rucw#e1!5Ua*-X;cURAlF2d;% zKl}PPe>|quoj`q+mjzP7x(jUl@_@9L)i(>1BBI~zQHPR=_f%{CcoL?uyEdXOJ0Dao z&q(E8T@JEVojQ9Nook$AT~h-6-0)Fv0d^1MnjoIS70?xE}W*aF0M>@=gMe z0~oUf5qaY9w@-zhh@>`n~AEPy-&GOq) z5010^35wYcEIZeSjeA(m(eY0i^ycYX`-HJT{`p&!++#i_+-MP0+ zZM8fHZM@1uwKFQAbBbf)>m@f}jbGqQrhgPMKY3L)JMr32xuxQ^Dd3U+U*UlRnM65d zg^dngD42!694`Y->%Pi+SCz2JgM&hFs=%J6{9cf)nqLE{KNc4s6~*G~yThyv z%sT)&hF;X3*@@z16c+_EjIm3X7_OUmoMDEbNmx&ZJJ35W^5FG$RvsSx&=pqvBxR;Z z6uY}Rb?m%nD$H<_t_j8y!D#Q}qOdksR=*alD2^Qc+d1opNvNzlG?3o_Gs4{B4~mup z$0w>)gA+%IdN)ja0IEk<;eUJpt;auJagy(r%e(zAAMAg53Pk)Y8P`R|8<2i5(mqSt4M=^Kl(QX!(;g0Gz3_UYh~tc2 
zHymJ2{V@TRz~HpkLET;@{9V~n3E|Z_aM1Mn{2R%eXueBzZ!k~d1FwX3aVwX@g@+cQ zg(k#zk0SWxSRQ_}p;0@cry8Cf-k9*#;V3S%`fH8w{!r}5o?~L(!HK~3q>$@&qfD@5 zQ>xLU3gWwJ+Pz{rj+nodc7@*9OE$AnM&()XG1s}__(D0j-EfNSR!cRcX9ucVOQQT6 z_=#&ZlIs+neP;xCz3`xEb2X=c1hCx9dP1f380^TModX(C4p#Y@Za>Odm5bIBoKf4>R)Q^AS`u7~5dy4gjm`+hA*WcNfs>F_; zX8S9#`Sey;nCdmA@YfosNWVR{CD@R-PiwAGh#0S-3PT1P9lWzCE6yEqwTj1WUX~0Z zS8Z7Bo=JSq&-^?;y%wVcH(zd+7e)K5@QU-{_Y5q8i$^-l6oxaOJYh6g!MAnoyqg2u z0Jrs-a-~NWkVnYVAvvNJN|iAVimtr`#rr(|R3~1836`mIrwdx(kiv@kUj}zzv7@x& zE4Ci!m##9EVet^g^;mMKEM~&9C(RGYmc(Lm>$aUsvyKHv99OLl`-0+rDQcU0%iOTI zu({$JItY)*#kIq==_>rWxEM6kl7-8BHM5g6v;%U}(eKhLufkoQlmEtsoxx3|Pf1!T zCPMq;c#c$8FjjL_e}&dCio5VU6ahoSKv_Y8zb@H!|TuFgZTn&0x_8pwa_2&DwBx)-0?|MenFO_;ad$k6~f zq-jr=N8N;~yh;qLyAARViP~!P|C3L58>}!dO{1faADjf2H81Z`xJvkck>jal2d)yN zCfx3P0elWpn04qp3qLI`E}kEz<1z}41suz70Ck_csqCg@aMzkof0y6Qz%3@*6R#d< zN9%L=<22n8$y=K8t!8sf?E1Ph)sFh0iSLB>%t}*`@BrtlPgOIp3q;J8S=}-$P+Qyc^^dtT@1W9(wz~JQ;b*2u!%3HfWPsHM9`@e@wl3JXG)h z|38(alu8Im%9@^rN%n+@BnorPg^~(IMMPx}p+!-a`kk5A z`~1G2{<&@)*BocAbLMi*^Ljk)k9%>ckJG_NrfV^iE1uMT$;WpYQO_44{Fl1lOc3fk zh4JbCmn)9v-Qqcncs>}`kCt0QZK-(*)RxNm#`&{27ZK-m;T#!iKPumV+ERVD)RyW8 z!y1}T0oTvsx-MLAfbWCh^;x`bfUjrqd3N2Vtay9*GZ-WLYl<)E1~9t+FWU8%IcleS z;m_mASh&FA^2h6bDdKxova(O|Ng1OI^GblkJBvEk}U>!#n|a7GA3Z-7`X*0q0P7ZRP1aJ zWbO5byHXiIIit|HrltrmubGsSG;cwIRaRF|ALf6E4tv=e7mew{J}p08K980sPmmO` z&%k$UshSZVe|R~1fX7HI8s=XVdXsoQ3h2WYTHT;A^S|}%1vWO?Jli+SXx!QL5Yj!e zv~(a}g7l!$p|2y&Cg=*UK z38NfQnfS&FjD;1zEKdOS)fPbOZ(i*dEKo-+ul?MTUn2)Hzm#=6(Dnz#b8g>kO1!Ag zjh(cdB=x>2i*ubo7U%<;BRpk*3Bi9ncO(a-d7WTiSWn0Ly%6SoB8Am$53!z2QpWl> zY?oWvhp?MBPMn`}gFoJ^=Q?SsL-lP;b(Fnqch114ubaXoqC>NYvs@m=`CL1 zj5YgVHbcz!<}cPdmXrlH?h!EBWtm<*BwmsWjWcfmt&chTx_-%_!cts3 zA%7}iAF^Zpp=B~SoBUVs*TM;O;G@EqHuH42UD808sYn`Gl0N3jO;19Zh5Og&H!@-S zu0t{Oq%v%sd8=SztP1KsD;sa z9IJ9cEB7(x`!4CB^^r@b-so3?7YRwUd}jKBL$ut`t#iq=JUj0_vyCH(N0H&eFWjnk z>`+(fJ&%?ajFC>ReKcP3q5Azaj^`I*42>^GIewbPJsRRT(R5wQ83}3TpYSF*G`wr^9Eo8}3r|u_!7G2S{2c>EX 
z)nS#A=b^Gf)xdi>@A{hxx*p^u#+>~mqhD!zBd!mU+RCR3MJ@F}xyfWxZ+0|rvDW*3 zLLi;=GE`_s&RZq29ZTGJ*1mE|DM6L-z)N1B)7fTIcH=pj_bewp-zoxgn3y~z5I4Yb zRa)wf@OIRX&5k*akOa@ve{YH(_5>{P6N}n=`Ouu77fu*87C_^dmmZ8GPg_2o+yCtQ z*I)87PX|(KUzw9$-d6cm<7-68-}ka@>w7WMR?#^1-}m#$2jkcxYwolX`>*#Nii~Ch z&ua>mIL|UcW?!)$Nrq|CTPtPvu~j-Wgyp4$oaG?D-_mc_qeUX#Zuz5=G-*bTZ?x1ooLou4bH(wzTRevm&j-VEYw$b; zob!$IXK^kf&g;TCGC1D=_ub-tFx;nr>t}IY7p^zJ_rdV`EM7Oj*R%LM`!Pb>@$mQu zs388RX#cS^bOZI{dH6UM9A@0>+B)(U?(ebhDbk8Xw~MEBY`Go}uBxToosVsY6E8+% z8%jJ-=8&27f#X5o?(X)69-B-wmrI7?D=dw0h6~+@fTgg^xiwZ~MR3zplbNp(t4s17TjOfFIwx8NW*#odvd~eu0t>%Q zC##T5n+$b`#}LZ#ZJfq~y+=uj^j#;b|0^`>t0Oc3~lM>G+HJ;^wIJ(da@6# zRCp(UUr7whsoR*COH@Ye`hvUVrIk>VNSk4^T5VK7>lQN2ngSn94z!$Vl|=mLL-~S( zZISA^3yP0outE?g*NdOr2^*qNUHw_`R|0Q@PvSZzRXYnI94<1yKh7qu}ncfD85rbUOCP?ZI_im z3yW^6SqV2lm3;pf$wv9$aqsGdZTk_l>YosSarp`Ha;l|jx1Scm@RhWG@i7X$n?G*7 zAX*0a^iH;%;pRhoid@@At{Wn8rRi%4-Bt)|8vn0ra^>*T$?;1(**8EH_nMnvZxARt z@h6ZoQXduARR3RQx^#y_8mgEi6IIbd9*w`o$|A^oF!pfF}$9Ia)3Gv6M)dbztkoY4ibc&bO`9zn334t87K{4R4q{Fu*V)A>=FpFXXA zOPEhdP z5_{R&_2M*LDC;|zAKbXFN4}!8Cy&{tg7`K#WUk~L0}KeRlu*4gKwdC;W3my$g9uu8 zz7^i(2$_{1{8~DD5S>qVkDVI_uC*b%L=j!WGEz8d4a6fRj(b{hma}x<=?Hb5-2e|sQS#- zr(}>Wq8WJBjuKzf&!L?WPP#E@V?GfWO%X+zlQ^36$;An-+3fjJL_Aj<&%4EQ81Z~C zJhukVQ@}ajIDZ!BBI3L*oFjwt4RGHr?gztt3b=k2*LC4~1AHG0ug~Ij1AIM;&$G<@ zPcOL`j6yrzla!JeKSaMToad-pIC^d(uZCOr4V>HPOLN`4ganN$G3{QvfGS0?`UYHq zFlxxs>h&v2WHhCM;YmXc2phBDD38j81**;Sme@j7rav>cQD|KM>acCcID2q$S-U!;4bYq zB8COS=zS^;L`JS0gX57h=(W7F=WEKp%1K6texj)DYCSWgb;hFYXS0w>i16FQ{dnvVjB zu3K)poGD2sPgnnAdEo0tv@Fg3J3p|}s_WyPLwH6{s)`$&r?DK%%L-%1MQ)FhhXc*T_}k53*5eWS zB`kkpJaM1CeB@~~K}U)|@pcXr*tz(|(Yy*$4AR)doXwE^thw1e+PC30`{U^o+cM$S zUDKbu|7arbFEF{QR20C5_`K;J88OtGWj^6cn=JAp z=D?hi53e-NsvsLKc5x#82T*h0*)B=_Ishmkk-GeOkX2#6J7%^Xf;bOh@l9(aJ0%Of zQ=*Sf>ydIl=HCay`=7k!2&#vN`vslf9lZ-@&S{jmi5sA@k+%dBuAD`pAK$d9li^2> z?lY^n`O+1>6lc3K4J$wZovQOkW*hh?F`Aao9Kgu8)IoGW@?SN@^ZqbHubuc=XQFlv zQQfmZ%m1+Z?oQ*3&6ICQc49qnz|ByzG56Y{b 
zk2FBLk|(MDMfjVolj^I1?N8>YdT$^MQmHybn3A?a)yKf-*uML%WaC!fT|N5*UmLbo z?8=Nm2C7@_|I!(r*FH)~ARcn|_MN1}G>{qY{xyO8j_X{%NeS?18++He>{5!+HK_P~ z7m=vY4?a1%JRqsO-KDxOPzp+aQ%p`!e?lB3A}cm?YCvbCq;SnQ5=K6eT{jB-N38A@ zyja%6uqJ>an`|UVxN8CSD%mD(8lx!joegue8GX2qTSr(Kuj z5!v8)#%v{!K>B@BqWhj#rY)_MsLLF3J$ouic&<2}cU%0w97a4J z49~5>^AvE-H_o5MxrjKg3+Kq-d;{Efi~GTFp8~F*#dTe{-T>bR!|Stn-2h+D;`8j) z*wsNt+fm5)zxC{vTf^suW6mS2gBva-2_k-Bl08v3^5F9YnWNvVE%-}rnmoASi|1_Rr$1aZAa{_^gP8>H;=D)SnG z9()&Zk5X9lgvc@<&Rbxn4=fA%M{YIC$Ojn^lb#<}Y>B^=r2Q5n2uM}6%-r#;RPcCk z_ST-$S*>l2iurXiaYUW!{3Ua{SaSP}=ef!8kCc_>mzNCA{vd&)&PDnOpD6F7nyMHH zALLi$L`BU;zqN$!yqVKP#Qy5jDM)R0ZMF&>eK(vcqDoN~&kScsP$a<=$_&GA0gBs+ zon2Q78$t6Jy`sv|hhTxo>wWh-1+-P|*FMJA2?_?C?%I^R0@X`x_kDKuffv#aiU$du zfKfZ4O~v^M7-6{?_2y_i+7a|!GG+7u6tuouo0!}UQc33Aave@6BWk;GR-6L6W1o@4 zR8Y)MpI!Lkg)#E%(Y?j(XB*+41E0Uibfp2zM)z#7*A|(L->c_WQVdlUt5~Geeb9?B z-0zy24AH^M95o){nea~wm$%u0JJ1ELhDjxs1IZ7kXU;2GqG_hgQ}VaNP_ZgvKSOLb z^fvAvSYK{|>vx_xE@orgNF2v~HU>0U5{CG@e!zDemrr;$!iR+%m=r?cKZjp`GJ>qfFv|01wYCs2Jg z;P2#Fs@@x}1@KXIh;SfO_ET`~1nI6S*N0k$Ns`g2-jE`vWm0z1$zjtIyMeL#rIBC; z6_8YNu2RP>6bN3j=Q+5)5H!Dr?RCgeZ2TFJt2xzV4!WSV^<8rC*HRDr7SQdIn3fmixQ1!T>Wc|;>@T{5x?aq zX9FY*-FMK~d`u)%y1#PL=p}cRboV(k^8-=M$Hl4p4#PO>A0lT;e~_?xneLHHd&`vjRuO`yd7hlkF=?>7j zGk)b-3C7U}60RYef>^&%o1Owpe;;Y~Lh#Bo#;1CIWslJ!IU{Ox(ES<_^2|?Uz(jfA zcI&!|5+4Wbc`Pz&EcB6j9kg6=tn5n5yTx-D@q92mw+7Es_&=O)8h;jhIgN{m^SW@3 z49+*eeYdzD4EHJE`dM7ph3gIQeK5Q}i`NbC^(;QmHVu3~@u++le)IKw$m5Rjgj@9z zH%xDUZ&0L8|ap;z*-edb8eBGwXzlFC<$f_L17p97wHvbfxV|B_z5ci7Sl z-Y4@`WxszzH0ZS7>F1e%aQ0=N{UaKVTzJ!ptbgFOAJ#!80!vU zFmfn`R|XDJbzP*8BM+#017L}IQ1`)rkzyjXJ`0Fv_R{JGPu|)7(zoL$3(bho=GodV z#6^)~Cw8At?d!6vAyXQ|r{v`J!GzJ5KqJi$JI-UuGrkwSxS%ZPV-XI83yz&K`Kmz{ zVP$VfyRT0i5cJuh`QsKz?)OpS$DPd-nX&ggHm()e{U3VE0I1wy!_;%((?kg?)<2tD zWB{|}{E;p*6<8q<*hz%W?JZl=Ou9;aa63(pR~e4`IB^M3#jv=Qa9O;&#^tjiw|?I8(vlB?#Nx`?_r}`Xbmax3cy~3@U4T_Osuvc8s6KwCg!tBzC+wLu7jt>N1ip zvV9Z#d=mN+d}}TuzWuk2zm`|P#?xt6i<#5G-@)LKJyS&Fjg5Kr&lYQBwboDBm12lm 
z`;PPmX12p;tmW=8Z!3Wi``{Hm;X??A-+i_*)D!VJ$C+lmVud!&&c^Gqb-~ZQyl1(O zQ1=xG$?oAp*yoWI(ffB9Aw;fQ?|tNnwipUs<#4QqEcp!ce^`paC7Y8QS^QDJYe0w@ zwemp4Z?aO@4(lQ8@aCX|xC5due8Pe=(-+NL=JXM~{1}e@9;f9qSBm_g<%ULHSDQ$> zYL61ie*H0cXp0iUA2kJSQAUgBdWm-n{g8)^xl~>Iftgt{+dUcE~iLg<>(4pdOHUQq6o9(H_eEuxwg>mT{eLRSkNnt5jDB@&GtCnTL@ zSHbRMo8OCGd-OF?X~(%9>i+O78@rDmqKZaIRM#$3_@sWdiWhFfa-VX)s4dhG33ff$ z?shG9JtTUaYh~o%zuYy}QPMD_q9{N;R109+>5Tp{R9{o%qp$?aKg4fhZL#COvU|iRr|S&Wt7jpeEB^oTZvV?+#IA>y z4@P|*HMa(9X?Y4b=NsqG;#|c4$Lso!BZFNhjcYZZtm_b{?8f0oTvsx-MLA zfbWCh^;x`bfUjrqd3KfC``?+K0T{V&ZY$FW_49R&XF|~q`L%Em$Z2(B?|&np&)OB) zt+7ieKfDg}V~yX`!t$>V`AIB?#)Kelg^zM(|M{YS1m{%UPTL|fV+TE}r|UsUK%cq( zQUN3sUlblO#r6@4B^FYiC7{8@{C`dsq$2H24!a#U3!v3u^}7z!nOHu9`-{~};ox^e z_qd3=2dZoB_fK0c6glwP;OBCH4^lSTZz@O$LI@h{aq`7`u<`eiI&{8)dOtkqX1J0m^S-M5%VTAdf4~wN&)@d1RBSFKo+oXz3*`hFn}F z8@Es-KEDWNKHEi73aoUr8*Zi$ytvzWm+dfpSh|q+(L`e8s{$Fl7g5;j1g`KZpCj^@ z2>NczDTKk{lHNx8MerUcV@t@`ZTK!#Ce`a*4XpUuA2R5R?Gw<{FM`2(Dgjj=ZRO z56-)&R_%H50PgizbT{>LM+$&$OJ!3bv}v&l&H6)uZ5lCQd;SC=$9G^G=j#t(ej(q7 zw2Ft=``^*laKsB?-uWkuMXDKEo;h`B{BkLFyzMF>+7j{gQVJ-SyARjz=kd2zcp>G{ z;$DtFd=NS2?q@MKOF)j>dfM@kEZC?h<^ET?81AFX7f8+yMuZ|Z)Z~A;B72KMM2O7~ z!TQZp6l2i_7`yk`HgBq9|9LfJgXMhVLGuXY{r8D4vYf_9ROj8U=)zd=PP*sXWY`@b9;uc4E-eev zBNau*X0Z8XqgKg+-4G#&uC>CDAe6AXL9*{tC^Gu8i+EjR!$~}S=vX)dH2jKW|mFg1* zmFjs^|03jndXnm^fsO?MNvRb-D9j4Bx;Y1zDK6Z34J}dIpzDEcC#+1l;MXxN#`Ap! 
zkhJIB0m-r$C|VQy2}qT|$Cv+(zKoH<#wXHob2OVQW4@PMhpL0nMuU}@+rHRw0#B3w zOQQFhK4t%4G-&7>m-pI6olgl*pQr&;JsHtA@2Ld)SN6}AG|_ZoUl&u9(e@>MZ~daX zfKcrH)5z}=nV_Gg^l$hvadlbUROOi%T)CNHD04yrgnuKYykt2*J)h^`^fnax93_cc z>_(?lza8si^@E>-lWip0eSqI((X*nu`qX|`o^_nT`VqFPOGzTaN4did#VX)W+b1#p zF}Fd!_W`Rri?LY0vB*$i>bx3xQAl8(h^<@7mE#799_r5GE8+miT`E+YI(-H%ZgM*9 zIw=Pl{`@n{TxOx3-&rwSoO=Fkr++G8&&z+g;&|RIp2LXegWV(8-Qy6ThC(i z?DNBKSvfq^ZfWr~OV zAwlKmv4Z3AC_zZHN^~+E6=2vY=0GZdx=Ldf^TiKl7RT!UVT5gBNLpjgUu;VvH_RtooLm!H{ecB!2zZUH}Gzzm+DiXC=Jo{ zv+(vob*iolPNQ$AdIRXKbc41JX6Qi@jHr?z+OkX2>IOGn{O#e0qJ@^Wr?1k~T~}WpeV(ODF~%d%oTk%~At%X%^K>$)~aN zKd1_6O&~^U%@nCFW|IkaPTQ<+M-xLz=89Zc1}L5Z_oDnitxzV@!lYXhR!Nx!vAU_| zlN6_cf@3<@+Cb2|TT#5rZJ=kAMdjI%E^u-vmh7PQ5)7$0WLwD%fI`--Eroo8ATVNp zNjstsbQuqx+!kL3^7w=9Ct&-RdgET46A=~gwOR9{JH40>ndf7YZ)zx7^7ad-)5Cfw zS>5xoKq?tv;}ER)B?{>%_;SlewHB6r5o6dgT}0Jyz+G<=5uq5a-CIuIgnx+~{XoEVsV^z$x53}% z-J_BLdj6Ngka{ZgWa+5ZeMQxc5W;_JQT08M4()o^(9MYhCAShQz>dxIQ!_h)q1Ca? z#c+;V_*6*evR0@fn!0O#%ZK)8l+C-j=~7h+>d%?+)A_Uyc;os?!9>LwNSW8yGWKRH*${m7%iXqT7nWSH+1(GPgHxko@SK8(ua z0H%KV7cieVsoIs3>R*KE`MU27U-^S?ceCx?&jbMTU2Eb;EG`3)!9$;t-^2k(J|o}# z^E%*T-xs%)y9|&ljtGR-)`O@h-D2CL4%mHZW71#YM-M)4OWRCIg7>x7>I;AQQ12Jv znkoZw?US5t&Dk|$r+jSSKnfu zhJBA|6WleeCV}ymk=MB<$I)*FeAfKJ-r%{Ta?^ElW0=6^G4!&S=C|0l_cF$1#-5MP zTe7GsSN2#!W)?`*{++z}$QK4*-MV)~-U1tE>`GIFsr?ACKnnTPW0_>&+zkBc&kvV) zheP$IKJ;=MP2YA{>)TPRpO0dSGL;Kg{0STvKo#sv1@p|kpqDeTn|{Fz%<%4b!B4LW z4#}R`u^ui3jBEIJEVR!krh_VSD+4KRvjO z-RHP594GlY2%hX!Y5u_r3qC8xT0HY(J8R!~f9c*^8m9 z%&b@)0(-xN?{N~y_qq3vjuA@WR};~DmQ}yVln<+>zgD&&%5LfFx6hM6a_M|e_l5~1 ze-jsLIjK&4U0bIMBlZ23R=I3JZ%hH_RZP!u{fH$O-|-~)ny2U2u~DpUrl{5x&B=~N z#&RE+mEDis@8VlqWLAmQzkKM6wN=k_NP@uzpI*1(4d0c(e$QF@Zt#-S&^&)n@$c`{iPwYAf`aJTPsCqH8 z#vpV+_6bFVZ)0tQT}+q8eYEYdeqwD2QAX&I?uenJnI~A?Ac60#8})k)VME*@6=fAt zY?lAqMZNz#97Qq6cTC4gc$kWw@cJUzYT6D(vH!<+Ttr-*AM1Nxxd9pWs$VK&hy{gb z_3pClBq5VWw+?P1c8E!Q7USKl3+VTOr=5-M4G`Iu<(^tz00?7c>BZZy@4XZ3er}Gw z2*D!M|Bc=S)a8Ja`A|$d{P@HjDt|Ahjw{a^-6iUE>4&knQQKQTrDKs+9P(?rI0{+2G7xdGEbu93xVoA^3GV=Z)c 
zYLBUa&n9u`t>Ldi_I`5g=L{@_XPRs;w$2%M4$Ir%nU=PR--7)~!G>yI8cO&XO$ zwcZDF_H15*Mu8W1{>JJr)ZbqvZ%6yNn3msvt%DWPXZ%;k@3j)V5GOOIv)dNf?)BZD zEAN0^Z{%URR528L8AFlk4E4`LWJM2FSHid~8tFm^^-45|_VS!l;f z?AJ`}br><~8Rj3v{!w$qsVz0{mfBKt81Z~CJhukVQ}{oeZyJ9V>rUe$Vl9o=g|#$} z4A#>42DtAQ_k&@_X+8y9Ka1P(4~Exg|4-e3ww}daPn&1yeZ&5JFYAF@s3&A% zL+6+;uYHOy0k1YQ`gI&q;Y|vQNaqfHWIZU2n>oo3J$rS=wZYaGX$!pBV(|O{u-|jR z;mhkn>Nw=C@29T&dHlHs6H$Vq(FgYU>&WRtFG6Qu<-#9jR)OoP*MNx8-j`7UL4Z&{ z(RbqrR!>8Jzll_LMl?S+ahG8Az}YuVue5z#(MVr!euIsA>iFm0`b^E4BoM}*tD_&V z=ZSpjBHFYyTe_sAg{X1m1Wj+?vvT?R4&`oYKiQPa?}>Y&@w$QHE9SaZ>UjukhAODF z#Cw&ql|j_&E!)qch*-;NJ@5K>0y0;r$w*xg28#O*u}OyCCktqFraB%RAV)^nbAaTR z#FWy#71M7W;Dace>Y!}u^GX<#UZQ;-o?h?RO1FxWG&5wO?!OtQF45*+VPNX9?FudMcH}2l_W9e;?o{k2 z@!t{1E4e*qkh3YP)qDel(Q=zPtz+tVn_UIJ~%v~EQ>c(<9C)ju&OdUOmK9b8n(Dxx6 zk$SqgcFHRdp)1?^%`jZGTvfFCK;+9b19bIASji#vS|A^5e*8K8UFvue zwB-U&VrFAT${`fAz5XPjai|(jW1o|V+uR9r z#~$W;!fnH6T>eA^gNEsp*W9L5zX*XK&O!4hM4q$PK@NS8>v?@63+~%9_u1=EE?`oA zP#teY^XU>8_(Q4pkzjV>A~N`)r21`gJ?t%%xXzF1MLF14cbL@Cd~u^S`Z`#@NU?Bh z3k2(ae13)pm1S@-;u5t4>VJdp?OFAOAihaJY1j^!i)2*_hf2||gRoVTcK#pPw7eUF z>H}M(XrMmuk97{%kkHc5HY|X6t~j1|OT8Xy4kNXt=7UjNYHkhI(DD>;&Nt4V#kq($ zuM6kM;CutzcZ>VMaGwIMpT%`uxZVKY2gB>Lc-;VB&*JlJ4Cm(YzmhLuiuRK~Qy#A9 zQgzy*?#^KJ`NST%$k|k=6}omN_Hzj+-Ya0V$Jq@V55-?TA5*aR5m_mBBp2OtTjW%) zTM^W-Zu;~VX#`T+T%42iQemV8kxSYj2#I_eT_(9T3d`&2kL0@=gq)#FdXEvW0)3H} zgI9bq0LS8s2;qS&sIPSB(eG^@AR}ys$~S;)`e9VxEw?iLf|rz5?u!+upaIA8ekmMtKp)0P z8VQ$JB9y^xjR&jSL6c*j+0o%zC?-AKf0s`QwK&1Qd)taLdfH4%WWx3WVv(&@TgKY} zo(i)#JyI)&#TBS3=cWguxZ9j7(13_sGJH0ip%R2X&*lDj%&Q5AS^q1@{8$G)r{ewEAW2)4d!V!L;0@t zyhFK0@Ybu7X-wbrk*%L%juk)hM`jq^Z^!M^K&G~=FiISAMfFpRXt|-P<>RzGJCOw) zTF%mT6+K%1$Aj|5p7gv!s595c@xIpxl4X2RH~o?xs(WN2*<{23nQK`z5A?M`Eo#U! zTjjUQ6S`UFZDn}mrOd9X$4`XGgO!V0#zrdT<(YWcx?We1=g%Ffo~zg-haP+CwbCz! 
zcI>(n9Y`=FkN3u`au}qOJ|DEd?l zK~v-nnWu(e^evI_?Csg_wGm{;L%a24TTH-d1s%y6W0IG%Tl=P=^=V0dl~o~M9wzH$C6&PBv|T{uSu=NsU@Tig$Z`xJ2fEUxRq z^#=Gp7+#;n>jwCG7N2MN7Jmu(+9Ba&^IOrD2FHax z#8_lCVLUv+rw7gvPMiw=6a~hLSWe86Ly^N4n~?|3UO=>`^EoQ&jFEe7@{Vyk>%r+m z#xsBXZbDIw7vEnBWWag7RxMqEB~o02O7J)PBj8qI&!(6#@x#RZX6$7(5GGF;$f|0ZYy7P*wLC%a^i>*o5S4=;vaB=Gu;$2P!`My2{ z!LlP>NaXY2pq65LY79aNiqI!=CDr;~9Nze|h1dOC|^+e?z>$U~;&{I`_6t6P`@ zdMe59J};*WdbJRz6Iu=be5)k$&&VbpRXa{%wmEhoHbkBhBAanKTuFkYENnOV=EoQL zIISh+zJt8v3o%W?TuJQ2PqC(wx>r|Plns8)PUtj|lP^zeGF}`fUm^1%?p$oht461i z;h;?-^M!rlwyau6+CvpOS!|z^soSSYR?-m}Y|0`pdZ{6S$7|ve&tHWB9pO`~E$WD# zVr-RV|2b@8ARJSWD2DBqy!vmEby1f%rlaRN3=k7j@4@&x*^sZU_p*kv9}415w+ntc zhtx6GXgZEY!rW(m?~I;Up>{{20;JQkU}(H=s+rVQFujNQ@lIEIv|EYG;ohPqBD-$z zE7(dL^;0gf@UpxIpG7oTZ~K@4=}PHZ_Zu1@+MMXrs;ClDchdGyv$!JSqgS|-Bvb)M z-tu>ToG*Z`Ba6IsGRDZnOyi*)T7l>+TghF2PkCgm|KXcuag3vuFlS@?zzrUY-tqFX z?{#20>1d-%Z;HAHJUM#XlnLcor8E8!rj39?DW61UMRbqg8l5p#SJ_J3VUopN0b5JR z`M&v8AT&d<&@{>%x$AVMfi#ZTe*pTh&5ZKJ%_C5wzUUg&kwG zJUgqwy|kPqMUZ^6p;iVhF6zquz-5hQmnIRv25m?0KfgT}qH-8Xo;l+a;(i#t)7tdD zjUwKn;(ziDIEaxG3BJ7>yvZ$^q=~>5wJ3RiSX{*Lt(3@cTJBlfVOB%#b)I2g2@feUE~qJ}yJ98}f?X%34i{Pg~xCg{}t57FfGidH~I21|9}V!^`XA z#J(vWNzKqUvfanm1Cm9z$TK0|w;DfpgIcJC`#gOp81}wdX}2bU_~&Gpp18IJWq100 zx=wT#vLJt#QSp8rWk`HQ=Df#4(lW~*f8TO4rR{xXg6lvmS$=A&YtKhdqR)8c@@kwS zkr1c9!uU~uoa?MFapIOdY?At`Z6;y@Qo~-(PW`ih@rfv7>MSp6#lcltM!!KE8|<~_ z-qJ&E3+Cm@@E;`Ox#D=sfrBz56{W(`jW0{gmyO7GasnRO=` z9sl1vYxXrxerxA*_>nUY!z&Dteu^4y54 ztrb+g0flWPJR_aopX~gX^~JWu-^89U4Qky0G6=+WGU$INAD)|~&9f;#zBlihD^=+J zYngVoGgMyDgEyd}Gp6O6Qp4`)S;1y!w;!iHc~)ag3LlNM&TsEbuZ z;-`W?-W5%TA0u@$4#!9$UyfEtsF8>WlWM1O|1D)GMd!Xs;jM&1H-s!7R$c?COmr09 zGD}3CKPYm4r5dU^Emd0ZJ{EMG;m}(fXokH6pUA^vzOamoIdGqk0kT`kK;WXi8me-? 
zApM5wNqFY^P6xA)YM6Xr^}yll#nAb<Jsw6rL9-=t9xqTRwnnl^_Dt#?BJ-ti3wNa)0g6VBKGG|4%50VBLR}= z_kZRqELr-P??IfFv-JFt1ah%R`Y5 zTl?>lyNvH`wpOI-37tM*P4Sl97qH5*K^c0y{-G|!1l9>%QW6aGfS=r!%^3opQ6e{f zcA*?{#88L#PP=LcNvvPeSnEAh;Fv&+Azcuey!q;^ac|05Akg{6(d2Ci-;Y%>XhgCWPg=}w3sb1K@s%iAx&NaL%f>rTI0YD0a7^1Ow}?}J4# z!bu+GxZmqbD(?9t8J8$JwfM^v#n7zCBCcYxk8e7}}2Va7{KRktT`yj~zvM*!-dNjwJr&ghUX9?sPDT@r9A&nykud z(&T+Ziatta=ET&_wQ`qreY&B-jE&}xV})=n5LjI4>K7I-;iEDW7fC<>nLqT z10h}$3uMB@3-M+Z48BjdT^c-}3Z!-(gD;kh+5+YkA{7m=-Y ztGm-M-~3vaO`!g>7qIG>82N3TJCNSsUn*Ej0htV6=iRROA_MhqfgXpj{gUrH!xXl~ zq2@ooxf&JbLt7c0Zt1JFKzrAmL(E+*pyMOF@Jb{Q62UZ$ix~0k42{=y#G9U*%8{vW zdQRgT_{x8C!+f``;}t3P1)Rs^>x^FMVm<|hu}wd!ewN~6&gHYSLxOloMvJO9Ae~Fj zqV9u1F%G4Q`i3sztKLy+-2iMmRMnDw#8I9+8`$VJccEoMcEec7iqcBRiXCcA9ahMj z`*-gi+w;~o+3!Wuz=x<+`l)IyDu&!q@@vBTGMQpBu+OyIZ<$n-JKOXA(hQ}e#v@+y z>yrHY^0@5C_$}n{!vZ1{!gu-dmle&GDaw?l5Xu8>dK*f9u(jUZO}j<*G7p+UBu? z=aEvd$bV8QJI@C7+G`rzc=s0gr=VeTT`dymP1Pw+?a@bPC5AuUceo0M1$2VJ${Dow zpG+4?-V(L8KhBU+Q3*Fx78eK~grS6{)-yHtd%$QpLcXis9IcFuyIg!y7j4}0#8YLg z849#PgKY;Yfxw)7U)}L0V1Ls(!3J?avE?hVT;C3fPSLoK1gfAUzoAU^mP)|<*O-1A z$915nz}g&5Ifqt>-s{Qy?1M5%6^!1^<3`z1eGi&gM1p;V27J7sO^}x_SFWow5*%?S zUOsZH4$KRj)=#%Thq`#TVTpQHi2SpcZ90rD=u_(_#izmpVdAi!$og*-c9!0Ie95C6 zT)cdKN`t)sL?ZpK+pPQ$uh8g;(o=3InGh%bgzSkbe@#&6e&m1**#4yD*^OpQ(Q=m5 zdBbS=AGHTLXqzQN=TwllYr z4kk#T$>ypV4i*{kifWMV68oCGM=7s?Ul)nEImW%T_@t23TqJ7crg(#Ld2sRblvMyJ z`>(P=5DsCZi-3~T~$|53CYg4*L@%H9ofXM* zD5^c)U4QGIKin^vrqKRA00_){et&(`1YRBfAVe>43=ryu-v3D@kHUO9o#~s|apac0rQkPp}@4aY7RQH_vuSnVgi*?}5t7 z3On075`Y^y=|VA`Ju0f_yjWq7jxO$cp(Z`ygZ8oOjdRmiKnc^GH5)ez;o9>yWykHc zpwQvOk86Vda9e2tjX(SLr4@~fm@w{6<8@UwRna&yvo(L*Fus9;i2YJcA%k3tzJ>7^ z<_BwC&Eco|6i9^1DcSUArep);A63^yx!!!hGLu9yW)| zEwAlI{pPu1?Vha||XD}b^#p@3~ zRblJM=pUcfk{-dGf(&2ATI->0u*R3KbH%VT_j2&vie|`~$i^6UM;!?+xAAhq^4JN_ z-yV7A^$0d-%n7Nr97XDu;MTBr_kfV%ji~a3blBVQYhNrUmitVO24W{~0>4yF`{o?MCD1K#FNQzpub|BS{{8AsbSrEdb`mI`5#A`{W($wSPFe>#h`Q#9Jy5Y zBR#PWN**cYm@zw#2DZF^5FcuX1nw$Q{d>y^PCjn|c14fC;)PFIx1$=Ms`ZB(s)33~ 
zyG8tOt9b>qKJRVg$*f}7O|ZBr;-3LvZi?ZlqF8A0Y;&Y;*$*X!@ap%!azXC)oEN%p zaRCt_Zq8VW+=6Msr`G;Cq(IechS$!f-Ulz3w^q(i)xy8EuidiG1fmA)+PN=Z+aq#M z!tQr=mGnwpSYS5qFl<;i2<1NVfmn6!_zAicMX>yJP`IO5 z8(#UfGhk-~4`RKeNa^ghNFd5su2*Tv4P-9P?e^OtjQqSuubD293YcYXNZSrfx5({Y z)?V+}O=eQ{y)YgYMIp@^|7C!Iq*AH)p0UJ^htp>XTBLZr;*`~dvg$R*0z0U-0>+%Dm49Vzts~;=JTS@!ZEi= zOMiX2!@7ehyLgNkJQAje8D#Uy>>fIF@0H@)ik(bImhit);(jL}m@0Hr6PiHXT#MOT zJH>!g-2O8`w)C*EI%=YTc!?N4rnFzvrIIYL?mjA`{N(@7702^#@f=1x9}Lf}!SfVw z&Nt4V#kq($uM6kM;CutzcZ>VMaGwIMpT%`uxZVKY2gB>Lc-;VB&*JlJ`>kO%ZSs5A zC2BKIcSL|F@pS=h;FX+m}k+nC@X{de7l&38;Zd>6%BM8PdFG8`1No z3z{p`JjAUT3~c($=eBd?Kqk2vtWk`w9ef^y0FJM<+5{^=9+Qqd&wAQWv{5T*|6=bng_W7GFi^WH zFRgO)w)pFPAN?iRyJF%I@#t z8GoPhAb+OXs@amok%awi&E*Hg&@0bb+mP4psNw_Un4}@L->&$!ir<{7yJ@TT*l^-~Krdm+M^TI_q^`=k~@4~lN;>;m~Cv))D9Y|-`UjpraIuk){CAe7rOxBk;ysR#iMp?xnzq6 z?r@9WJ0PbPwbOcjG)5BM@uTIhjcY+q_?*4DPchPuy`Eye!I)S~P%D(%;6MO24gT`!00*bT*PIb-gferR^B(k=;J$Ma8fF=Rg!$kK$;|9fe69UCZH3upfO$wZ zv>m8J^~H|2dfS*dw_C|Szek{XbQ|5d89`)St*fpmE(E(Sh)liyeg(#a_eKcB>cIJ5 zq1~}zk$Aud8J(Jf!+7E^@q=jgyUDA9*J)*C;%$aH_V&EYG8uFb$go4 zd4O37^l!Y5a7wg#&2JkJj$-}CYPPXR9OJA0+>@_KcMf-7*R`aLVX++(U#DoKW8zY<*rO^`=JrFICleRe z%Dv>|3(DaKWgP8A6miYUzML{VL&ed4ttQ{aiKXYh zdSfS(P&+o>GlhJd(4W$^y?B|&vp7|e3(R&=J!NZRvq@eeM~66|Bz~~BtVg^GTIl7D zh!HxbbY`#d%W1^JAzsw;+@##Rs&Vf|pB}(Cdf)9je+r_*2n-}CFy|>gV2yl+&>vS` z8s%Qmw|gTCeqhy=ADKIpa#25Qb<`VIWk)JAPY~Cyb2ExO&bBg8L6O+C!vO z;4cK2=gvptW2GGWTb48X#s1w^fyUjUe5rJfj-F4!C+EJB2 zH?DR9yl%;UFq-VdJRj0mob8J3Y&i8-%v;TAvNxd#oM~2ilAbGbW zAM8J#!vFBIB(96Z8<2G{GCxb^4aoH@8E1o)d)Q?D^h1kG+ondOCyiw0<%T!vAiT+* z(WiIL;h3G$$I2gPi18Iq7WtMpqWoj0Sox1aICMQujD{rYCoiQ8J=;#hReHWH)S3ws zJ@@y0W6)0EAzrc;Cw8UbGOt{=FIKpq>-g$z_rx3K^`uF}?_geEo3{?q_#y3h^RBv+ z^k+YoFm+_8FFVa!k7k^rp#Sf3s_&*>{KMjd0ZcCOlMMYOv;EfaZ+YUf6${se=Ib{3 z3YC-C_>)+5uqeJZR(@7%?Escl>P(5a6L8}3$1IA{{jEH^j-0077>nJ=M<{}{$*@_X z6-W0C`_=q%8H#^j?Bh@Hp!-jIcH{In`nR@%$A>Aus0{^tCq^t*z@$~fT{Cv5{)*)xZYaIK7==aS!CgZWS1^G^P7 z!!glRrBWkQw`{pZIrXU?%{QuPvpK>$q1Pd2(gFV^=n}ZPq#Cu;c0cYr&umwTIBG{s 
zRe1Z_Ze_hcV=p%+5EA!!Z|%5o1#XslY3sKu9XfANdD~xRKzwln$7HX$;yW*M2pUG) z5YgpPl5dtYLXKyi{6%axV9LqZD0{j!E-rJ@(!)_77h3+N|K}MG;_lt}$nN<%I7gi2 zl)K)Dj^FCqTN;{&N`Xv~6>%E67A9F2!l|pp^`@_K;fvlQQS-!QV59ao ztjpJjc>`j46pu2yT?`(^GpQEmQJb*bqoqWoL+THnH(u*CZK8lBu*kheN{&yTSh zq}yN~keG^or&{MeT~Lq4F%^TA))#Hi^LjO2N=%uEITgLBs_!x!v%P=lNlO6~%ELdM zd|nFJltxB8#XNybpuiBHwFj{zv3oTo+Lk!=?B>0Xxrd3EMWa;9RnElII|Ih!iuQy+ zqRP5^wo&xS=A16O*fe^#ie6cc@*O&MwtjR+<0IyCTyQWA)%nqhPWA6)UiXE!x(>{3 z#osS;^wna@{7=5QCYUv(a_myqu(S6?$1z-(7}<;aTHDSYE6i4KeOmXSj& zoL#XFo%uVA@!inyCGMDF+ynU+nf*n_*&Xn8HavRE<&~gG<*B*FuZIv${DsK&d*2a1 z@lHTqn-iTkP2hDg>wI4Qz0ruym-gw}>TeWP_fzMVnfSnC2|lfRo+g2xZ?pcY zbfz=Ur)Ot^6WR~c7rdmx+*Vh=QeYmhK=&gBAF%!|?(>}rzjq%mmJ_!D&!te3;iMNl ze_j1p%HdPYc^WS%XW_(HzT)WLEWcZ{W%)3oEz2K_^sOQN6i_=$=NoNV`m>}iBB|F! z>d27#2LJJH|HlVI(_`@z{y%>9AFhkU8<2G{GCxb^4aoH@8E5TfGnU%idW^1vibG?t zGFm?roagR|M%$FFAk+t^sF)6C#)ZPa%TmW>tZx7T;p~>3^OxaFRi-&tJ`J~$>x`E* zh$2*nJI&PDQ}IE;aks|@BXHUX&ohoG7v}Sp@!PP6xoy4O$%?mpZxRiOx@&1}Bc{%j zD3xPOy)MS*N5QkR>|S(qzf3Ax+b9ojuz0uVcYI&)@_yEHj?MM&B=$J5@Uv*XG(RGE zo>IJ6$HE)b2$!(xV3{KJ$g1viI&G?WY=2_}CH29-IQvi}LDhQ`m3iOR zpMp!!eQw>~eD6IvF81ux&}#~hmcfsa0CpH6Jh4Yfgb!@%6|8hQy2!u+dCE8z($RU` z?m4D;n}WG)axH$3(uA03y7RR{O3#6HQ~4p)z&+EoQvG=~D9}7#SsmU0>UPHQNiNcX zlq%1I=LcGWcx(SAucx=bm}B(o_k9)MIeyS;Ti|&x_15vp$4Nh;OT~b4BC`oj&2LDI zc$|QI%D)B*X6g|pT_-fyIc~$x#oVGzHihW?TPGieI^&_{ck~X9HNovMD@Q2;toQb@ zTa6uVF(ou<<53FPDC@a{P4sY^1~uRG)&6)&WNglt0H!l0tA^&L&q2W=Z)JC<9e!Jj6k*Q<6i&+Dh;u{MP3#@q$Ny4mQbc|=!IS`mY-csWA4=YR0`a3rjIR# zf2v3B}Sokpe6-?{e zF#F9caG2q7%Xgo$SP~CIIrndBe^&wGj-Ij*YDq)=XjZ+2!l++s#My)VI0}<0=iqu= zk70XYi@GjgTuK}|-0KMs7#+N1{mT~U*$#*oy;fz8Py5b)h&jH>D%F4McCMhcxTQwP zc#5DKSasOrPrhpbyl_t>L8>ziOc28^=J~97wcaY5XTFa!9IXhKhW@-%S?}kIq8ADT z7s~2?4ATKWtM%H-ID!G`D^B{|l0J;2KN#s-L;5L@I^U%JEUAk~>UEJiGNir%$-5=_ zU}!olo&t%VC2?IO-hixwk@;CNZ}8uG_J479!k*pXkWwGaG%)3+#|MEs*aNc&-YYud zaj+juvi!jc&Hwq94yn|6z-^G(&f#nhi=sGXH9yfqR?X>@jB>UB~3%vm}z z%=rg`GH0->vY1 zRR=p4`H+>LZR?R`DI`5amYKj}{hHQ$l~gwa;)g2Otv 
zJa9`0-*htwsdN~MNTP$XnXC&_2DgDwd0N(VK^Iu48M|4ed>?%IT(cJo?g0IVR&i0b zH30o;Sy7sH3s876>|$G$M40{#9zJB%2(&*hT`MNa0JPV9b1&Z|5ic%ODg93C0Eem% zgq^y9^1B#ckyU3L@%xBL`-gpP-~-ozi;pVeP&+m#doqaFFt}^$j{P@4>+341pB-n3 zyvmY~BV&=opOnuZR&Kop-6yw}vYVEG=OsG3o!4gpdxOt@+0C&;w!n`@>)Lq2Hd0EL zO}iPI>{j%+^Rfv%8g3WgWtxY^*SD^~nLLCPpH(5@c<*K5@YiBD;WtrDbLmI5K+7`I z@DXCr5(vY zu5nO@aqrRPQg<_Qiwt+_s!d^CF5PPD*x4L=y_d7xO#~gYZL#hLn~lp9j(& zLUql3so=)NwF1M532+B@{tc!-^GbytmT#zxg`yZ`Ck0%3IdJ#OAO)Bv^^}|HYJ$?Y zi}zyrjsq^$9hC2?fnY;p;N8C1Y_Qt@^izY3GGH@*U1h=F0$m^471fh7lP3cO?XsYiQ|sE4Ex&WM$K(ld4#jU4*JB+_bINnJL_OV(T)Hw5!Skt z?{Us?!s*hhZLbT_cZ^oHde(kX92@Jkb>U9iF40&2Nc3dt5z* z!16vd0L#Z|IWv(?Vu2b-8 zf$-U#+^0ZG_)S5Ja1AK?=jlUFlM%$9M*+R8mN~x4GRd82KTOKh*MYc9x$@GIS_*fb zeWV(ho(m3c1``)hKK)CMa>7WAJ~|%OTA9br?Eg3?<^yy8a21|y=-;mn^1jx^6C{qV zeV7~!?y*^GJz5BdtsNK4Ua(pJ=POS7-I6|xq(2zxTSNLOkUHO_{w%4BNa}TwIx?ib z0m-{1`CueZfyB>}xGoZJ@PF%I|K$z-x1J^A>_C`&L4wNwd~SBneL%httTt*blK&G% zoIIL51nm=voF)DZ*+_@aG0w%Y^IH{Ez}pjf;)~$+!<>8dt7^fxn0_{e77LH+ZD9E@ zI@J902Qxh@#`3M9wEDCB6doV@#?<+y;@xFoUlU{KW#!#WT|@?b1xv3BJr63)ZkzAY zqo^!>1K9qH#k*xbCw!{EHWH)Y36rM)H%;-fi!5HDFW+)>&%SD;>->&~g*VtrRb|z| zo)uWJ^0R2aVi)^AD9<%Hm#M{u7p?dgXSvw-{xsd-39UrEG;rh>jbVNDt4|ElZTj}y zDPQxRf)!NeA1&*oSNd)hF!{X_Xe*g{q_`@Ac-zZF+S@%at&W#gR3#2_m+2H0EN_Qb zrq;c<9n=md9=$RNowx@(t@$kLxE{lx`?V+EhxNgs{i`llD?NZ;MkwPs`&!^puEpOu zatLqAVMxEy9*18Da2%9e-2g5~=bzvDvJ~3wPvZ>T6N~>mQKvJsy$Q4{yxFT2p9Qe9 za#s0glJG@~)`OPam7rev*})B0bI|KbiM45y&)ANfn2%_ewAhS0jjsX7?GT-4s@QV=|pbh7x^Mgw^H z0WKiPV}^gU6Wx({(hHAhI9vYwasr+pXIZdTI}~ineGbiDWW!&fC)T@gBd?rYnL>s= zNIX`i@qrM-1^Gl?ZZs6yM_{AdO7as7hJ7;M__+}PknXbK;favgEQk1Rb9yVm0bu61U8q{t&TqBCyssL5LzNbGatfT2hy zZNNxumtrJ*KWVdf*XB5IqmhH<&)ldV^81X)I{5v9&>ExGb|Cp7Ah^@P_$@djS%sq}{RO}cSNr*YU zHAfw_xAr}g+>Kj$EEsysM}k#OD~eqT%+UQ>kV*8l#nnn|s1Mt&fG3i=lNqyTP;j11DzSd_bDZTX%~^S++9qk4!UC~CX?hrb3?9!$7s_jxC>&LqEBOS)+)++y{D39YRYI0)*&b=mVpwsF& z7_~XH=UuHD3}2XrDr&N!{V+SpYunNI;`<`+X`*sRxj#sI4{!BTQ z15Ooo98jxwho9$f2hER&flK&Jx2R#}b;9ay9%`ebExYg54`?gCvGLn)bUs`{V)jbJ 
zaM#P-l)IkrrnEw{9Jd89lag&cX0#n%nx53CS=2DksjP4rd3MD8!*gG{+Twx#G>7HTjRH9S?XLqRych~7rH*iM+yXWZ zNi4tH`;EI1Y|;w}r(A1O1KXS4_&LQPz}J_G~bTpxgW%W8u2! z4Z1A6!8?jPs}6?t8;AH8ywHb`mCd%kF#Ce`TT{m15w_%NzX{O9r_>I>bUp-<<;!J>%Ott zK)}1@&C-ZZD*o7bri6~_NcmsRtW?n_zD9)0*iU6Z%s)G4ucs}3&Q{y-3ylFE`p@rt zp-~NXk958$_Hn@r?r*m;_`<9B8Bn@s^+oxF_=OqT(b%iWiDdV3R-{b{{ACcgM9>1z|4 zv(1Uxv}Kzmvl`(V_5&Ns58eX&LKB|@g3iK@LQg4`cQm4^*>F;6mpPuqHniizr2ssD z-Sp1)z%#_4q#$R=t4#Qz__Ar5SPfXOFrQ!VRRT?^Iqf+aCy7-;+Klh*K7_YMiM75TIg&}c&FAjfuZxcQrbXT=*eeE)09 zO}~Ex5T(I=~J@$R80>B=X#J-&rz_HTDr?dwbQT+P4{(ed$-MjwH zHTq{AC^ezfc0ip5unn>o4^UA2J?OKxC`kv!$983xU0>)I<*I6lEA5Qccu81y;55~# zZ_#mb69!*>OG+1B-$ln%t)iCtBR$4PtwDKD(@qGqgt85c&U}vNhS6^2W)2w! zAlb6^UAQgswW_+MIUt1M@2jsnx9Ow0p-tN;`@$^BKg)dl=Jbco=N4WrVcW$}mSm4p z5MK)1IgGmQ6Wib(Z`(glHY)-wKPlui4~qLHr#|Zzp}cWQ>(!y=Avz`+zV$3S(&@#n zcZrW>(@9@(((m>^AIAUrgOR>9q@Mz*^G)i{lDdebUKgn&L+TrlyjzkFM)DL${49y< zBJl=f9gNJ+l6eDiJxj*fUYiVaW6ChBd3bgSUrz?OQ5`VqqL2Wl-&$VfGPWm*4j49V zF?Pql#8G#Si(BJc;w)RgK0Qe+TJ_0QM)fIU=N-$B*&w;2@W&@Uo!p9Jb^NDxb*SJ^ z-Qz0yuiaA);z?!sDGcmv^4rvMr&&BUG+NTMh;p~(pDyBYfeTU8?yIO?p=>t&cN^}y zri`U;kios3PL6>Cl!5AL`d6_%~YKgC}I`5&c%139+p9eaBX%=(dLYZ@Xs~jCs z$gG1g-11rZSwM3RgDY)bQ3!3_wx&f?-)-GL>3VvIqHW$D!9O0ySehyy@K^XEbtr-( zxO+o7gFWh7zFD;@{YDp8$LKL1${aW5bb7*({wm?FRI-&i#mJwpV6C_i&URh#cwgd1 zB6>{`_4_Jyf}fjLXIYjR@xkGPiCTFuv2xPsQ+gDQpxNfcv&F>`pi+(R${sV~(fBNTQJ8p zuYB4wJ?lsK%LQfH%sS$k_v2j3CR)M98I89F0~Mf5{?d|1zs(61E`{^OX|{v_`#FmL zY7M+pZccLXEpQM~E631p`%?TOd5= zS5L%H9#FP2+VuMV4OnVEV4xEhiZi%B>*n8cA%4ZqRh14H6Lr7O>|L|l58u2cvR?CC z4UlV5mrCDS2Pig4Y&;$fu)652^%`j}{E1FZcuQg&aco7$ic^T!rVvr^E;$HBj|{8Z)sPQ{knXT5|-pB7tNv(<+e@iqQz zyl1Kpu)6&HPOtZ&bzgqv#Ed%ez|HcV?D8;xl|PG3^l(IRJM_2OWqm?w^RyWI6BGy8 zqo!fJFaEXhyw~=SUTRfKZS2N~F$OK^$-BX;S#%XOcZpA?(R56imp#weo&pl2w~jdO z2KSC!%rDPEx}~Bfd*;11qxeZT=dJ)u$xPCH=h;|U$Lz4Y=Ae6ITz9ReG59DoYEt+A?WwX zMx~5m&eTOX(Y@yGcXU0)MV@|~rCT?0Jg>>cfOG27YGF4SsIH_b>Lh;%@C7yg^+?x( zSjUMQvibrj&ZcB)=JTR?(#E(8nR$nMPqwSyMdO^XmCdcBmo;fpJ!SC!e8vCsyZxUJ 
zBk2!D`qq$s3Z%|AsXt5VB9eMtq>c=!Z$R>HE&s;{BY6rWewM^_k$3~L4o2o@$-Du% zo+ab#ebb+ZcxDHo;lhgAb-}Tq^C52ie4RHjuK3a}KQk8Bh&;)uF1?PpZ@Ej3ULHu? z)b?!Nzvp!`Chi_C)PwS-a<&n#F>z@$gl8*xGKE{%qjGCv6|rAsm3$3ES#ra>UZwMwX6DtLi7OCSh5;t9%oacS~KH zoyp{bF+3EGe}4LBp?SRQ zIoeW;zE@iNpdmmVaxxCIypO`)8!;3|(UV zbGmPZH-+})tw>as0ex&~>&+K$Iw=DCHeJ-Mo}pM257U#re4(duz52H9lK{SI{S&2E zdImU6cFz^3@;x}VFqYPAZA}bzy?oBKppUBxyFDPZ(!k8mncGI&O>yprk}D3{+u$;9 z3R*k{GQiI>YgSErqIGN%F6w7zN94Bmxb9MmL2=cjjQhJg(lOsO$PjTNvhTgCIoRcn zbR2iy+bHgU|G9nqw}K`OOmS@cGwzxPjcpv26?Kjh`yL%SE$^g`huAW%+FsGaUAzS= z?QPtE&-BYRp?1~KhS$A){<<9@e&UPE8U+`8a+l=(7FB(mmYBcwUC|jBioK5C*;WcM zE72v;DTu(n#D&DRIOCY1SB6@WCBa=EsJQP|63pb$Ug%$O0}5&HTM?5S4FdJF_qBdB zAS@=WK3(ncH=zsi#CpNaIo}O-<0;xXYd+$oP@!Z4-`&{;cw0*PP4)#-_|r<} z$gqhM9``Cu-QP5nP}w;d!9T5n8#k-hoRqdCcyixI9J^l$w_F`suF7*0KKN|se&=!q zXj~?5byutu{C1qrE3h&nBs4D@u(@jDh5Sn?B_HZVu_#iJuA zZeXWeMFS{JiIbweUl$md_#<`OLk*ozPmp9k{RAyGqfeke1tLf{Pim1 zFXL2%M{p3`N~2&(93{RQUKCWuN_RSMN+0p1r!j1mp=u4+(J@CYgzNa@5jwzy#$_bw zaDPSf_IN*j>IAvtY@l($jkAc z`&a&A_^jg!ws92#-4YuEdPP@5OwRP|kCXRNe0`JjeC1U%?w{?~`^wSwd*{WuY|5*O zd)r=iSV7L|_h;LK0)QNSuo+%M_>9&wUK#ffqU$xoasJFax;{sAO=-K)@st`iA7v-l z*FSEWlU@fcjxQE%8C*Ie3SaK zq%PurdR_n1ksn3BD^OKjnU&Grpq4KF(4?jR0QqEjJqch_mN^AD|a>Dr2o2{1TGY=sNs~s3 zIJ@<`!i~byNsLxnwup}3emb^e=xM2x2Zc{=qVTGw1Ih;<3VGg)Q+7XECJ?b$3G7b% z`l4QR6-a+o{J=kW6GVCdiQqGhfNFd!_oGE4cqX|u$Z}~VkQS3WbZTiPklp`FU|&ZA z=vOJfm_F=8X!%h)*-WFrbT9Y3b6^=*QDS&Hy~T$Zt(>tJkIDtT>n8c(-!p)A{N;q_ zNpC_!(BoL_)if|>6wLFvT@~q4i5MEFD7zn)iG<&+2dk|7e`iJ(+coDzT zh%K9ju0!vc5aIIJlThXZ_C`S@7hn?U%d>=BiS~d!M`9&U5DM)oe*6NBu=>Kl=oJ_U zYR2;7=5rk(#{OuVzJIuo|E_-qISh7tM4sA&^|*g6OznB0FIiL!Yu!AO{PV8C ziFGnzFIW2E@2;g}Et|9=9*&f_o!4|BjvozL%zkBqpEzVuI8{&%&NRIAe2M(7dnLTX zlT`EJ^3lwgIJP2~czMZv>Alw=v3j(}y)K5ad+^7-`K&1G4?oc9GIJF!%=W$K z+9ZhepSf#?o8loxshKj@DxhPMRiApFCsN)#_;RVaY@Xo}?0D2EgdJRcA@Tk8-~`Iw zt{3^3xIo8L>Si|ZEyEAeYFao28HxDDDZ1Z(<;x zcWy8Dsf;+#ee`AihNw6|O+&*{-5)fn?Xj^Kq{?(9qpGB7+=R+SC4!Y9Ax6X_*aW|9w7Q3*q`$$ 
z^oL8R)lzrL!k~XoZeoQ&D5%^1TEBIfE!vNE=h(W}=zWyrJc7NgPmV2*_i zTmDc&LZ&vZYCMSis6H4And^gVp;`@!4~<}=x$KPiBUPYH@Yp}>*a%m~dF@|QX9s+Y zZ@akci~^pYjFhb1qXFqFPWs)FK8&P480lL>`YDh)-=zL5sf$SJb&)zUq`m>kyCwNx zBu|0F&yu(<5^q4(!N~k9nKvNUvt*prYtYORIx+}1beQLsTLpu-TjFNX*|}iJtqYQ_ z2mGK+_~bpyg`xvd63;Alm5fo}G;uE9?5hucl(sT`#et95Lzdqy`}_|HJ*)iYXU5A| z{$N`MLs`Bx7?$<&mQNhL_|iX}?fNHk9>@*i` z!cgs8@1>zukktFy#7imzV66%ct1&Ko`;#rT=c=!Rg`gcKia&!&2pp6?koT7@n8=bn-yZfllz8la$Ss;X0QbHc@_ygZS{U0? zTXZ-18nl=G{44f;HHbZ2t7s)&1#-fyH47cA3Ekt|e}`242u5j0^P{^yxQxrQv@p|J z$Zd4v^pRr31GOnMCAcnwOLvw8q9r=GE37eD8R|>O^lNW=80|t(wBFkmKRZsun8NmX zTTg;+;U2Xjq5$Z|ynVavTP-{~W)~N{E)D9}sz2T=84l`u*2*n4D+dx|n*^3VNT&`( z?)cTgnL)LDp(b{Or;e(>{+6s`)EL$JUBy$GTU6k_*?x_za{w4rc*HT18xBV0=W1O< z*8)sGl2^Q@j@}=!jNfm1CzPU0?>ai?05Gh@>HJk8Vx(SqyL4U9tUt={2 zdarg9a>n93@I@-L$QSnM4EILuqQf^c?AH`!tF}4tDQnTOfone|8iF<@T-*+zV6`*pIv*kUFe!JnwQG& zt(RKSad;Z0K6aq<=v3n|_lli-x|_Sfz%gHL;DQXQ1fy*Zo12~H6sz4V(VpgoZ1 zf`gU@7(9K6>^owFtnZ>*~w|y zOUHU{Jc`m8q*Q72F>zfW@r^kXZvcyzIy38F05dfA^*`uJPtb9Acc;ORLgQ-N7wPUw zzyH086=ypye-GQZs+Zas{XT!y)FVd8d#8^>0_X)?CFUW+Rtt~w-dbaY)?Y|#NufQo z8T#CDui*-6g(`=t{n=Gp;opZ{e=dyFz@(lH3f69yp^_p9?Yh$nuim_(J@YmgZyR~A zHZi{dZVQslJ^r}{hW08hl+G?>`kn zKB*%QH?SXrduk0Y8K5zCJ&({d=qh0BU0-*uSF@MVzv<``UD{5# zNuMv8bHW|+uW>T=iH(MD#iE7Qc43f4*`lcM=LUs(Q|+LK+j?O8VBPIn3zSD~NOp^t z6vNdjD=xly>j~OQ`PSaQWDaT5ZiCCj4e;l?HT*dYGJvR}WpyWY6QG$>_f`Bjj@xZ# zKiqXK70^xuD{r}~O27R6q_=2uAw9$3qd1Jlpn03Mu%O}`b;h+~#1`*nV6RLw(pI3l zaI<@eG23cF%H^d}Y`+auOYqKdDANURD`$VN#MeV?Usvvxce;4%Xl$x{p#>4qK489V z;xYuQS8u7l8~_^M{0+Imr3|s0w>Je^z3Euk=vDSRS@i1BoQU7Y1}VW$-`A~|+zxhK zzjR#~+XB^=);@TWyA&*Qe|U27+b{#G_B$9Dk9cYcuA!ECh}W>bts!ytF#gfz`+;wN z6M&3tuGx@F2-K9fAJfU!2i-qDpO}3ukMb(~m!w8fz1q}ttX=3M9phHt#UZ`~SE*y` z9<*9b5FsL~IXlpL*8X^6NTm_@@P0<@PupHt;oW#^R7@BI%dabq=a^%p-qmw{dfOS^ zYqs#>{ebGU8Pgn_w}iBB|F!>d27#1|;v6DtveVnrPB3+37;aggnfeCQd_5NLmVtHhoRn36LV2q3Xk`QcQ~G!JNhW?4 zKCaGX;<~_&?(a;z0lby+bbFb$9VJ8PsM_2|ANn4)!M;;MzLb=m`&jGQ$&i0>Hqvr~ 
zu(jzOs-gCYMW>_J85gk4*ZrpIf&aYU`*7a|;O@LvZ+9F6tXY;j>iF$8s5D*hk_@>G z9{Y(A-Q~4_{r8}bZAC8N9-0s-5~qXb@Jx4mt0%E!Vb&?zsU9-6p89$GUM@&AujyMI z;7sUnxShQxdjnR#kXtXbAH`9-Paih@_9Y^VB?9Z%GT`0FmsMtePeU7Jjq$lOXTo^? z{5S1iK5)6#kH$6MP7^CfJY%oXwQ=>j#CI;knZW7jMrpHCH{j75&z7Z_RDtZx@I6D$ zkT}wTnK$n9z{5*|M|wSKLCf>gJ*BNBFvQrv^~>iOGCVM9JX$mTg=-p68EqK2iUcVTkl}<07PK zSy!Qz&?${2Tvx$LYKR=?_Z(R2Ah^giS`YT@s=cJtW5b0i*A_h0w*@+PDh zpHA#7)yIP(93D6D#ltt%hiTs&E8qjCqe1C^a)I`{u#C!AmGJWBb<$js_2A{424=I; zg1YRlo=Tp*1vSi7*vD8tglbv!>fK?5YgB)p-_JgVbAzG6b{>06Bar`k)oSxLUtsM3 zME<0q#`WSv%~b7X9E*e(_Zp?{yk-tDCEs1)2TpPt&y4V6Uhd#yh9i~3sPLjEj1%1GT< zYwOWYN8@u!m^Y1~btz+|W2hjGWz-nS{9H?DX5;TS(*j_g{e;1G!y|xNGuSP4d_R=` ztx$H5dk0$I_Gvz;2t~M#EV-NVofM3VdQd!R{sQef)H zFrFl_^bM$(mR7^9pgQ_**)LtKs<$X#D_A@QAZ53biJygP;zmqd7q}}qrMNpOxMs_!O9zKtoyf~9pC*g&fbgusAA8bMvb#uvj0rR<(4k*=bt$7 zOD!$kYD$jABP|g&Zhb9NyBW$1Pm$!AVMd_C%@d(3j>0yn{XQetlVHB&%AdPxFH_f; zzDfAVyO&a<-w-41bc4~1?Md14R1&OKSstO;Q%&jdcpe=Rw;#$FR=BPz4h0z7s6_R# zr<5Ri4PpL02`(kLL+mmx1C`Ti(^kiRQ9gK_%L=l)4o6!`4!=D9t66Y)QU!M!8^tRKWGH9*Q3esXZDO#{pQ0}j`(#Y1kn*W2)>+l5q^M^@(f~7EQ)QmEQw-q@)`IJ}zYV~hTLpe>zRlYC1 z$YK+a7f4T=F7u#>&2(-k8FZ&;pY4=gMv0~0SlvAKGTNIW95q&&X{TjV~cP4n6ot;nOyz%r;rW&x@fv{{I)0jm9+Qo zcyJWlyZX-i>Y);7v*tm`uWPrU75c#1MI~U&jLgIJh_}|HTCz2M^CtYZv&o*)lJn3; zNj6g7^)@g&t}9WnZ7*>mL%N)|rvlo)D>`0zs|ys0MD;#Du1jpF`_SD-!au`|<4o_N0M$G~{x1|qquyXM726o}e1YoQyL3?OH0 z46j}m>^gJsN?#(5*ST0$mT+kjZ+>)t)Nd#UO8veom)xlZa~j1s$2||B@i=-hL49zib6Rm<&vS$5n7LyIXyQV?5yIY!1yAya8-Vq6{0h z-+--Kd>b_mrM4&@5mN2UZ)O}|DDvdF7(#v6T;o)64sP~RctT$rbYQVFfh*^ekaGW^ zY>eMrXbb5pPWs)FK8&P480lL>`YDh)-=zL5sf$SJb&)zUq`m>kyCwNxBu|0F&yu(< z5^q4(!N~k9nKvNUvt*nV%Bk*`P<@4b*u_KEO%Fo5J)b5OpWlUEI`_8x9B6`?*7hmF zlGmY7NJW9blo#=L$-{R$Kj(m56!nrDV&#x$_4a)&`GJJ-t$Sw=45tGV!>zf=B3W?F zh$PJ2Z%E`wi}gPciUeOTaMzzuFvm{`%$N2iIuoxQq^+COLqSQ@ro9OdV?dLWT$#k4 zbMWZ=lVP4!WAwF4x{5V?-3e2UWVeG?y}{Oe=WFJ(!Jv2b{V#j>(qQV0az;&=D`EBZ zi9_|QBe7GPXSPDb1$VzK&&O{CRo`#Tf_&6eIf!E zR`A11+#4)^trmksp^|^gv0ac`;;3)igb{paz3=*H3k|-nsP#{BDu(T^nlt)#P=LSw 
z!_%?pixff0=F@xH55to?S0%sv%+xnITRQ!XA3qa6lLh_4;mfLMG^&5lMz7x5PF#qyd1raL48kBF{rpS;5T;iwYc2_f+x*z%cDIhx<$V&C zc?W-`cVt*#eD4JDyPK}H=yXtFX56Pvs{k#~>vmj$LsJ31lBHdz>+quaYc*@`;X!p4 zuM3sygwb)T_pFHDfhR{*za7#z3r$Xk*!gTq08h^@AN>3t>G2ZrL5ugafaLauRju!4 zDf-t$>p|5=$|3%~;dAGNi9Aiucbu~lc!Cp!dOGh21i7#Ornp&yEjzqdg_juH}=U#sIA2dSyoI2+d79CJ^ zuvhK0c{hCDZE$rB<38-2OVHCzX+ra_y13-68UEniwS(CfMQ|_g+ppXc_fb2|en83y z7g0R2LXCbCemxj%8z$X@+OhI)NGE23zWSL(NgdRiy?DM3<;zif#vU14;^McD{#_-N zsIIRPB0hz5aY944EHod&x19;D0HrD5z;Izw`}a(sQ9HeH*np**QIl?NimqRQg`I&N zA%CaM&VeWcqhos;?%%9I_`$IkJCXkn8lS!3kuSQwn0;`k7m@4o&T7kSHQ4Ip%lT%! z2_#Rv=6zar8}<9Ng}&4t;og+`C%wH<|J>QNaV`XhmycfP`Xtc1oY47nEe4JiOgV1T zNrP!xg{#K=QGEds-!F-?bP`Nv#h7{_v?J2I9>i>0Q=UO|IXGb3D=&QvL( zeE5E>^2v#-x2cmHxf_aljVV_~IwFc%Z!>~6)?OG$UW4ZG&sUuEyCr=X(Rs1_!ARd4 z(occZ`6l&eNnJ!zuM5qe)RAH78<4zPk`G4m6#m1{lDMw_;SI<-7@40X^9JO4mW;E< zUU7U5`}72MZFy8)+|UP$GJQ`6S$4sdSGK*TFlu0$vxbAWtPP>H#UsLAITy?vvfRhr z-wd%){!eO4Vu(|>l5+r;^)4&8tKxjv6TqGEos-*lh=ETphM;l)o8$`|b) z4b3DFjm3*2?XC%+(zmPCb|wQD{37bSr7xiUmi-}gPZQV)3&E<07$QF9-n+kv86aW5 zjC5IaKJeUeuT#bN3Zxl*tiOTy0K_?oH<=9)#1oV3_|)Dr%sMB$c*>NSe+Tk8 zg3P=S=v*eo{9b`eEo5d}vO?f?tK$wk4uryscRnvMSrQF<9)I9FeEK|0xhef_=dp{B zm%Z3WHK_~+`&h+{FKvK_23;mn+=4*i0ae+>-9~u8!;Hf9=4YWU{4F?Ykb>^JspZr< z6MViRg&28L1c$-gc9-%3)Q+|8{bf${>KytM=X47e4;Q5E$*Kf**)f`|wHMJ4>UUhn zr~~>=?FzU!zXjJ>CuY;}`v@M&+hixT+y-U|9PoY6;Q+e+f`{X~(Rb}~w)u7kc^q5f zU{fS-gHP9;d!6r;052D9sPrOYfNXPbu5YUyx{eqIZf-))KYB5yPY~7q8~jGIZo>U` zHc@7)i@{9W?(%1E&%k?DOGf$L$Pi_kd5?B-spA3XDK&3@nd7uCQjc{xOrW@9W&M|R zRzRbC``$w(YVfVVv#rA6_2@ek6F1%V7S&m>yW=BU(Q#-VhxY2@J50y5H|)rQjMlVA zfknx{$IfwE$TDwuWaw+q$8G?`ug&~fnCHX2ZVNC1PVGkeH@`$DKcl){l>L%vmtFY# zAH{a>#2lf9@62_p1X~cNT>bF%>qC&c?D46;L2H5J)qwR|@UIO0Pz$v$={I4@xp_&| zrL|Dh>}`J1k!HC0%eT8LJKNxqcOoAho4a9@xG~)A+zW*_CdNKn*9*n&ibk<>RiXRx z%JmFkkB_|-doz`g1tmj~jI?tr^L}9nzDMxYOiZrJDuG(QlDEZHce2_?lLCpqJ==Z! 
zEX&{p2eGa{S6jig@xOme5lnfSnF?mgpItIVx>TZBn%|9&vf?tb z+GOd|V11spL>(eoab7fDLHoDk0qA}_o}sim z6VvZUQ;%`_1Nq`m)u1Kr(9=_E|7L40ee{rxkAYA$+cK zJrp^T4crX%=&g4m;bx=V=g%EYgQw{ZZx5Vj>SkdxJBobJaWJgpGm7)*fKgwW$Hk&A zKL{eilXSiOJj~($W9rQVsd(S_@gtI@RFYH_%F>34#2M$Yq$2w+`@ZkHWZyz|LbfFP znh>Yu>+~e}*ypGjq>9*L~gBrM3I=6kXm8_*?b^e-XPr zFk!lG%r)f!@{cPr4@!7q@#Vr+J}VFqwH0S`noAmYu-}0;?Ch(2J(V!cNZnkIR|DMe zg`s=o9uZ?nArg}HYR~|6Tt8IdMLiBjx=#ya+*cg;yT#^j`7q-CU{C(@t@-z#p8~G) zjqA_ix`^0nxL((mjts7EfcM?v{b2v=Q^5JN|M9wTz5!kb!|_=hH^B2*JkH);{%&Pd z@EX#y9x?uUsTZz}PyST}#Sr~$*vZ`Bj$SQd;o8TM0#3J-FMdAKgpISeFSr|k4k|N< z?>Jis8V04^vsEb&v(cMm%&ut*wZW?hO8BW69MHkPnt_H zzT@PXQf_M`*nyi@PBIMRfA`;cvaX2)(Cnr1N#LWuv(s3e6F7a!qvG!%iY}Fk3qcz; zCGg259n#iX!GP6tc>mYVz#D{osM-}4JF@Zs`a3V`3eY`-v1dtxyZxG=Lx>0&>1~Co zT8!5vM=*ZZshk``q7_0SsNL-Ei-DeHrI%0DCSc<(qO?6o4Iw68+}CD$A7ZTA??&C3 zfDL;N?3?}2gi52kbh{g1W7=@+HnjpEWmin-sEa~ZW1mFSs<%PZb}{sjnL1LDS^x3y zWlU#kT_K~?_$rvKIcfWqO%06wJ-}QOD~j=WnUYhRG!a7g{sB`q4`eXsx>U+9Kj>hq zmGtMG2PiCbO}o2Soq8Rm$@5!z3t@5C5S80j>T*J(6dGMEiim4Z0`HDCgnd|bg2FY& zF8ovyLLcye=bM)ek$tP4oCem8NbC3s|8JFQup*^WdA1$^m8vUER!JAY!ZQOjmko>i z1cJKJk{I>*(yLZAqCRiKIGAT*?CuHb^s6Ky=Ffxsbe8j$pS9ss z2>Op?cM0>~vR!-m9P3{)uhS_^!1@tl^8ZdxA4lgWTbhGht)QOLo&!QBBtUFp=H|BN zrvN=u@bmfC9N@`Xm+tSKXW{I;l~I&;Ib6C)#`F&=U{Ipf-_nRWD6ASgf9Gy9WV^7p zW<#+P-nKg?G)D6rUYt7rF<$EhJd~+IcOI(~W5Tb@jgFAMJaoN-0F~wSEe2y!HR$6jUm>zhW$uuQgte@C*qhHy0TPwx5hmLe(p&n zNBTs7{FVI&io%kq%L$%3twvq0zloI^H-krKlMPXx+RH~LrfdLk|J(ejRu^t3e_LN! z(t(1u5B#^&=~It`FcowSTb@7=HTu8!X(*2A&$k7GM)zc}E6}1@cf|p^zb9zE?G1o? z3*?Hv9|_qyeigf|#46Z4Yw{U|h!~e@?-<#~;inIN1nYBB+8esf}v4J`HCSc~g zqkbY-9OX}tGJm4t1GS{1k1hPc`sE2z0Wy4Q=tp$ma$i>>#C^qazgygg5%&kfeQR(( z1zhJF*Pq3Xe@hn;n{MfK;W{$7z5(8Mi}!=!eF`{#_V<6hE}Umbw<}|K4$Y`{L~fGC$52vmT1j!{#aW&Pn4%zFS_O!=8w>z znqt5n2F|G6P?2Vi!nQ|v+Rh zM`u0h_SzwYdBq5a7arL11LTP}BB;D^JV}~x12UTbtibprAa7>@way7TE0$kl_&XF! 
z`qNfdAm4KT`mh*=F0{%_9EXpfrm@F-Er^d$^HhGr?l#G;2TB3N2#U@ z&20rqyVV$JiB>ctaaAw%DT$9SEmpc|k#vQ6TDOggP%J$Dm?&FBku~I;mLdbPi0otC zhl4}e;XnNhnM0~9z=`klb5Vwm6#wZIFS*igB6C`O;CrSK;^~0a%QxF&VDU<1bmxZ~ zppC7_ab#Q`Mjhx2W{l(|?hUkl(qnvuOqW`C>9yv!$YKH>Q)+@{JnYR8d3*!I3}Y8TR9_#B8xypYFyCwulH`3gEO5?n#Dkeq58QZCj~_2nz~pz zx593zC!gjPH4)xh{9ft9s%YsT&-i|e>!A9b)i>s~8erUVT$nt49|orERBUOwhD4SR z^L5i;e&zgz0U_;bDBXnx;Q_T0Q1b1j*QQzmBr9|F{Bf&=6Vtc&gu-gT&ro^GL~Tj* zJtKG1<8dpbNtfCG?F|*wK9XfDlc5q6Jl%ioLZ=JdH zC)*P7+s#1`v35q^KQ)wcTCzYEju@A0mJzYO)!1>j%oOlJP-L4{Tn&7Y`o2S8-)%^; zqYrH|r~~E6^6^iF%i%TW;UNs4ZQ03LLd6XzQO&B(Ao4h`3%Ct|No%8{mDnct054r-1Wkab6eBH^A#)I6jNx26#S; z$65K4mo2#7zk~ZPUmRhN@j_l7kmYV@4+ptBciXj$4MNLLh0QnKo1h}>eotspBY{L{ ze8pUN8?=|HFuN_Kfo8rN*%&(E0}gih4V53djlNhSne=G6po=btGJ{|NXzuO{KeHnj z^u2O=muyl4!-E`h`DUEaF;#kN%m)m5-)=0R-N-$|M&Wm z)J}=jYJsGmDlro3N$naj%RG|$JZfc!Gm0d?+(9=UuR0JrNQNop*;B;Vn+KcV_H$@< zKn>;WyTjyNWfYZ&tG1K@gP_83r3=IwU)q?VmNIhs6`wEn-Y~#-(~F;;?SD!9uvYoN zV)+NTkEvtuB$EXs_scg82<}0e9VxWa^UMg^5ZGUJDGa`*FMnI9e*%%!Fwxw_x1S`~ zDLKm_z)$h%`QA75XF;-pe@x$FdA2!GUH!q~YpIfbC6!CRQs0t1)0tbYANWQ|e1D)i zuBDQ=_m_!973NeqSwT*nGS()WH_i_e(+ojgYl2ivnKNw2i`oaWd62dAbPelpev~aB zb5Di@3$p$6>8swVrj(sCFApjv2ara)jr6CzLn#{kGHjKF@Gs z@%#T$tB744EqQZNH;6yA@?;Na7L#5E{oK(p)I+(*AaM5(m>`useK6|bJ4SK-1D@r4 zj3!!;=0DQQ#gPYM#cF@J#1I7qz(6fpR~^jyq6Pp5wxjv?YCSLRXyRgj)nHf_@-t4M|pVGy4 z0VWIdb*0-sZv%5=EZuFa%&Pzco+ev6jD;a#xWb-1~n|ZIefZ+kMsq)(`}K%Lrmkege07 zC@XZT)`MpZXUu{cQ^3?;+A$#xL!{}2iuS!?EyN~6>|L2EK+hIjt~Z+2KyNEvYhkcz z1Rim{VWKx`;r_}Ug?{W|@LL>Ku+~lo@AojCVmq#n`HK|lOgSo|*PZ&#r8N>!2YB^d z;me=o@9A7DdM=|xHI+RftaN)NMO7eQNT>nP&tD&_V!tHTGCsf6skR$cc^UfUY~GJ1 z(NOpt5tsGO5ctd{!UHSK?3c zwk{rpxUV?wcZ>Tl;{IT`Zw>CJfa`qY`m?w$BCgkk>&W2x26*2s-VcWNDd7BBoY#f( z4e&Y`j?dz_0iMs|arR?i`%PQ*ap*~}b=&0iVbtpUqm6e(@@Q*!X62ES15iTRbpHEW zEYD2+LE@}DjM96sxp~Dy_(>pA;&z@b8tjtB(DA4ehCVSqSQL^0YX3aFUSeQ}aEQqM z4rX#fHEt|&zbnD`oRe-ZpJ2MzxpWgj3bxUpmCcMvOTiqe+EuqLXtxf^)bg?2Bw7Z! 
zRAN&1A^8?`C3+2uNQ8jIO@4pdNEs}YWzhW z#2&bpyY3bY!-*SuSJ&6bl|g(SC6D4kbwlVS(KqqXuv{qcB992#mA2h1qT&wtxz=%+ zKJJ6$6-~!LUk?_ts1e~q&O>Wb#NzT$+D|Krt;1gOj(hSXEzXqruoqfnzTgI;>|7=} z@O(|vv;j8|`&`LfL_Px1X4-@A3;vQAT4q8X>9Qc)tfwd0)%T)P12tr?udzT>yC8w$ za~sCrOg-oS!Wra}85G4n?ImCT`u2gJI#vkh+v761aGgBOv{^WmR7>i6%=}WfxrL%K zdO+g&t`d?y2a9smjxY){>n{*KSWh(kvFRg!`~$i3kEsw>#u{0EWrgv1kTLk#vv{P_ z&;cey^v04$bilW1t|J}M=b(q-`X$B;tUs>{j0eGlZXy4abG(t@*7`F-cJXDJ(^G~%B9t)wGGB13f)E$}4rK?_kz z8@y6giv4@s24r0cTc1h`1Y83iE*x=pL0sj)8QYw|+gn4C7+ z#+sh~T=4<4cRH0cO`8HAX5N;HW4eY0FETsG73aYFVjn{(gK~(pUbFrlN-a^gHRALU zqePf>YH!=Lq6VCP^hd35{~hpsU1K0`MgvKIxxp`Gs)TwCeCiTatAYCbp213dSs+7o zwl{3{A#mqppN+0GL+V-TMcOVYp!sj5GA6QXq3ENw!Gk^z!Ha1@>o+_VAXfAx!TyOF zDrUUiR_G;)tj_R6q_J=##A=t~yD#kE5B+`dA7L?ATcxv}U^RwoqQNpZk}yAybs2x6 zr!l&5p23-Bhc5D(=wj=1XqCJz@*~K2kqN5e?eo#eR-lU>O>t|gfZ>f!-&9y1LRq89%Fo5ScPy`{?csjBg-C>};df!AR;oK~#Jeh@~S`+yFe!Id`9$&nk~7 zC|_?~ZhFpHIkDr5v6SGaGntVCoLp zUh?FH8^>}mJq+Ae9QV7$eHd|nFx%w(paD4;3?-uU|!}}C) z{w&Vx!ubYx9Sp~3aohmUXYn{Iul{q`!ekuE{g=<~KU3+c@yZ=tc(87JT6_TJ{Fl$Z zo9?-NDbNVz;b(lGh?T{c%dF0B_qIdhl1O)R4xK{3JXL%il9>etx+QoP*DK*D2U^L+ zZjJR<1V5lzvqX;?stW4~8KS4huMKPj-2(8lTmYSMF7&)Oq=39L0|Hik!h#R15T#b# z{ADi{l1pR@^vm(X^v7-ofe__-Cr#gQpgTv`plKTcDf{-Qo_eYP6k;z<`esa1 z#x*$KRWW>^$Q|qR%d=df*fpJet9xoEe82PNm@k7IB(|uRT(kFw%+@B{2lnPd&LK)$ z4Z;H)FR0VyzL+EC*LkmRFiF6erXG$Da~KY0n%Je4O^algwcfcZ;0Md=+O<79=O(gSTQS`Eh%=fXU~@Gs*=&DZxEzG*a)^$lP8_%DfGSkwXGSZ!b#OLxx`!93tBd zz*lvq6{F{ZR9q%L=^LgOLddt6X%a((eun2X-zGv|LVu^*?Z;l|5x1Td|a_Yjf#bS-3;s&63-KX%A z3JG#8uNqvqxeZii(?9KQ`amjUV(f~~t%r<|URsW)0d8;Mb*cD4fzc6r6?u<$LY*AT z3gde(A@5jgtt(?cG|o1YK0xS%QZo^)a@HMKe;@B_KMPMpKW$YscDxQ=xK~yc9n*`A zS8!3T(nTaD$MTQrSqdcV?IelFq1177Wd3xc-L*}{MPmlXVHVF z_Nchu!|G-J)j>K7Sl*$|cO_U8tKYgZRXPTwfR}P7UG(;qg7Kq2Ziq!=dKLsVwZ!u- z*!Xm>w|eL!3Unu5F@&ds&o76DHkO*feWMPG@9EUKGofq#q&b2e*Xy)GH0ZcLcc8Wt*z#qeJ`!jcaI^~1L*%E~=s@(zRA)+P& z{C410rajr6lnW)89!km&Q|k`F^Laefb67RFQeCWzKcHDX+8rxt0;v-iJYp# z0q18?WmT6jxYj@unR`3|rdBJDxBEwdvy(jMP6pTl8`eX~G1`d~*6$Cgd;_>+)`40F 
zgX3Q#srW41M`qo^4Fu)<&Q1tP6Scd}ZpGQEF2q*)AR~7C(&(-5Vq)ftfEfviJ+N%F z4_vwJj?E`rj13$mihY$4aJ?P?`}?H?3XeLIabI!V?-uuA#Qnjr^|pL#uE+Veih3m-R`UZI4E#41?_bK4~S)A8}^9}Gi7>>{4xB;He;&Ik*V)ayG(OXC; zJ4HL_?}1jiUXK2wn+a0x9!@FdeT9vOAQPRx8){M5rzL`>gS}7AzdZ0b234!Qe6@1M z9)0>rI4t|;0{~7O{^c1^0KCSQlcKKHVe7FKpFbjj5^^@R_gVO%Oqw6^e0NlUW#PSXT4nl^T3vjiafh;$)d10m?l`B#i27E6ScrcWiYBn_r=&3Aa5$O3gw zW@*A$vVn(I27{J)0SqopJjfn&oG3jf=enJuMLa9^qEMnSm?#$e=UrEQ5%KkvvtsW) zzQ_6$j|7cyF=F@8qUNZt4Epe2($77tP2>ZLzV{W!nBaw6)`xq!CyAJDNYBVR4U&Ja zRzN}+(>Wn1$26|3Ve1nHQki)W`qgez=QE+${3-js-6yd7k-#+JegzRRmEEEFJOi80 zZIJTI9P?8MI#*t`M0(A-+;TRT7XIlT)9Z|SMqGP;?GG(eHo0bqmHFvtBiT;xNTpH8 z1rT46=XKd%0jiyTdo$gI1_=&1cj^A>3|U%VP`%4^m|W+%>%cQcf1u95BFtKV=}Hkq z9?xZ5!1m8UhR>+e)@KiN{*aq3?nPV=p@H`&!+=>m7s%Z=e_bz&7sg*qU?4;c5DA;~ z$9nq)$Y5G`awTDm%so*SXCL25985`MDF3AjMdlN~TU0rND^Cxh2IZK}$;g-vsYw*u zeoiUPA5KiK#QwwcRxBRp$CJnwf5@l18ypzlvVdeG^X|(XXJPNar$saCOF+J$RUso; z4nDYWo80NLNSwareYRA7jZ7o|b-q^bEBSTt;qB39KNDO0TP0=~>WM?Nd}Z%S*$}mT zg*Azmyyyy?*Rsxq!s@Q;zXe|rQEXf$s(R>hcV!T8@Z}Mar4&f2ncNHQD;16*|ydsJ&2B475KaCgigw!J0R#_r6)0qQTx} zf`W@z5osr~mg<}ly6!aE=bh39q$wp1VS^>u`h)Gkl_uDBkH>}|*DiGKX3%X{Wb~!YF zS3$EL$x_rhRDa?Gfyyh3u6emYt-ldQB?PE(HUF8Gw?1leKk0lyZY5y+8_hgnSOiVG zA1hZg20}@MUeQ{f8prPmE*RQi>m_pjURK1`V|#v5S001nJWvU1aRMT5Q7*?7hXrp|SToIZerU3peO6%Vc1Gdk$Mpcu$u> zC3!+Yt-!-$fPy_on|eF$Q`bLaXfi?G6`C7}o>QP+FV_QtGvt@m%%_tp{Yk%CibvPC z1^mxf9QV7$eHd|nu>bki{O6~D9oCl4H?BX6>muTMUAT@6u5W<%-QxXVc%K5!pT&7y zINt!<{#G3f$7gZe0MBRfI4h9ivZ?%h2nG&t2c*zZ{R9Lx)DAhI3ONUXghDSQEG|nP zRrf+=Nd!R*1BK$xNAD_p%>plJiYorjyCXHt{F(7LZBT>3qJsKiJ5+%EfK&O$hp@1# zU0-b;(`Q-acOn*gBLvF^7JEZa?7k5j+rboqj0OeowD_9?v-3qN%s-}pgNK8Ae%%ZJ zwmKL4^Rr@+nsLTIF&~@|CLTwoLbe3tcfgOq?i-27?+UM5am7DLZ?2gr-8ryCx-ygb zs5yid{F^15GSuY+iB}Yut@_mgm*~(w(ZVp`v^#7Ph}{EvFaC_b2or~)!-SWsVN2vU zPg%${=w;AzOLaBufg22VB)tAp!i?4@s8aS0hk|v5Nl9l~YCPTHMf!0HQM@dvrjO|s zNQ>^7z4w^P8&V^#9XpED5tLoj@+!dVE4po-XKs@%MfOjJE-aEep2_RVb&Fu{|3nS( zQ|BN(~qdA4ylKtrS$gz`TL^9*|cbvAYzmVOThfj^Q8UI+<3v&xGE_n5Uk 
zW8_7-IX&<2W%8h%(La9YD8z!3qqpu_cBaD5>?gm|BAD)(Y}=W0Zu&4iYJiy_@tHhh zsh6i#^qstmz1%i6oDC6^K40^1jt6yqACb20g%VgLBwQF@uz_)jIhgOKAt-X~mpwEh z13ks17NmZ2fC(D`H`+V7VBZx{7cI3s5G%yg>^JoQ=$#jfbAD3^LcmMEVYdblbL2+l z*y%>l0Uic>T&)I8|D?-y^MH02J zsFEwQ)k7D#n8swxlc9#zsy^!uyL%57jm_UaF_s3{KM6bhyMXZoZL8>>h{I~Hvl>i|aY#WpT?Yw5)4M`*!vX90p2QC~O2HQ$ zsf;(b(;!Q2ULNl!?EVb?9dCEi6?+e<#zo`-coChD6Qj`z ze)cWZJwEXWu3Y<`Sr>)zgMx*e4$Pg#;&+NK1;}F0CEdcuNCT9xM|t|tGifyNz4EJ_ zm4#raiQj3PsTcgPJ3e0baVhNLNRAD@kOsc1oQmL?NdfYy{h$17Qb`-CCZ+yA4uiTI zmk8yn?jR-W6_sxQPT$Dcs)IG?&Q%AL%aJ!KCAV+`O8x)kvxLO>bGH;K$jDXJM~+r| z!P|h}=5yCQv2l@p$y(DwY`gvD#LQ!F5RUAN`u@e0$i5nooW$ozzA3K2Vp?^Ng!_u) zez*VgVf@b@4EL?U{SR|uj zvp8;m=d*a6{rZMKeErK1wEi!j#q0=g9in)k8QceU(cSHZA1p#eHnZH&2XtSBC*m#8 zXLQ~<`g^m%FN2=&EA2HQ_AaNw z1Dm@ewjC*iALozo`&T{0zEfrMO?b=!qjlD+-YW`-(fiihe`9aKUr8$ub5~rk=VDKO z=UEf1cq*Y(sagoHYW=G3o3O;@6N0VYE1~quYssLh2~M9LmQS*XfCCu6$iv1NEnm7s zqkp>%<}LZ#3%QCTPOb@$jz3pN7L<66_)faOM&)CVQ}wk0`tI_`i+s#ac8`gJ%t4HQ z%T4#*-@pv{x|}u@m68Un?UrKxHo9TDHh);gB(HDn_wcvN*!zz#m3qto4UI9lrEc8> ziQ0y5@3{nkQv5|{E5pWT^p9%eOQs1yVgy3u1j6H7zX#<^)nuyVc$R zSBv|0FgShc(^=%F8@O}&nf=9n6PQn9@B2<`KO$oOp`la&3Ue@}(=jFu~;+yU894P^niM;;5+r{?U?4inK4o1C8s)j(YQglGYsFYl*G!W83|Z zV@s{@UmBZS_t7rs^C6sUuJ8hO`I@I2N(@1ftJ|7$j}OD%--9fQLH+O-bN^}D$U^M; zw1y10I3s8LeGW|r6~oc$`o_|}a_V)H_A&HAWc?NnSvEX`UP*8wM=XXqpD=^@AA1VV z-#5Ng1#>RBGthj@gBT5GQ#~mP6=B=Clji(gEDpWteWGED5=?#bgo}cZ*E~eZjWh>v zfA(1|XE3I3qh(d`?&}txs@3j+19g1Vp6$W%@QS_G4k0n{gbJU<$mKX7lk~b==_NJK z#0Yl*SYD3(QA`QDivPHnfrHyygG}@k zp}#HSYs%sYLG%-2q$mSgWv@k7+mu5N!r?8x!Q!Nv)UetBF}`{*W%`1AmEJQ7*{hN3njl3~$k0l2Bi`Q_q1 zH#F%5a~St8dvx2$VBJv;%qPYal`8E?1UvYh+x4;WMD?!PVz-tEp*5_E`TBPOaj}Z1s{9rj?gA&wMJys*kk;UI*4%e_c29vT^ zj#XAVa1lTBeQ_iM^hIh{#9@6hJw)rX9|NX{u?ycQ&w|Fuk+utl7jpg*Hx!R^k6Rr? 
zT3KwEH(!cS@0&HQH8$#0TJ)kVdYO^*Mt*dGShn;gYB|&m<}63ibsRc}{XbYi<}w}9 zb5!uD)U6oo_-j_`3MHxQ1v?qaA^mpJ>1USmsOJ@2X`_M8V81x}Dj{<@$AGkN8C=1h zSEn>TG;(eIb^L-RSU9)YKK9idnmkQ?Q*Qs4xbp+SzSVV+d{C5Ec-smu8hM`Up@+UF zFx5D-vOO~byABCf@>Wzo6tRBxPnZvpsKNH;?~*87IB;jz-$bxn?ZLQ)eeY4vquPub z7YU}fmsP08XZYrp9?P%F4P zN*s85{YosK3`!iGIPvJzPvZF>M;Uh(*g^U(&adia(I6vZ$7=Wjl|Lt&8IyrjZLLn$oz*9Y4hlJGZ`nr4o0zW&t){=_jS`F{lBb0?ciQ5J8oOZU7`@> zeqR&~PReFokv@uEbTtrM{4R$I-k#I3#6tIoO6v%B=Wb*@G;={AqLrNXlQuUw>2zVcjq=c^>I4!h0t%I=$3aLsPwnHv7_o*JPa8A58`!6gX5J18pxs|jy)KNL zC79nB_QR81H8AI3#d$e^oQ5@>6FE!46qWzq{o*w6jGe z1fF)*w><*6N!bo@MO%FhYVSogsr$42ZQ?>bo<;R@j%b;WmHtRr8!-Cl0cC$|jU(Ya zt2y=mgy;0s?{zH0P76i~qjqz#S4Ts8p6nl<=0TuX(%r=Kh6}(xr*H9{pzdG#yw@8e zZ2921!cSB_Z}8l^$5zN1&u;~Di!xZ*;5F(el?z_)U%d6_KOeXD5ld`23>mJZ;xYt+ ziK#mE@8Hr>4OI5y`x(LUP@tszq*{6-0Is~*WB%nm#E}+n(6E&YaoSTifN{w(S}cJn8tugq%eEf2-Eu5Oqor%#=reV%HajSfHyLvSA&U z>&c(b2yO8V-UL-sa`QT{<+ydZ-|t2{9R7zJ6n3$E+gkre1m=fyZd6XS#D_Xwgi0@Y z^dh|WtZU*Fcz)xGxR}jJC~{Dy_~D~2BJ-waql@ndIX-`Wx7G7^MBG>W|MR>3KOaVH zGh6;(*lBF}*5G~$xX$Ga2*+3-vIBs#rwgiw-L2Z0q4)+ye^z?fY-rr zeD;61!GHNIc3fL=R=h88^GD%x>iII9xT1k}x_YlqMq=-or07@I%;Uh%4N(a%8Bb*H zVN-(sZhN$gH+-kGlm|j{D!W0Fr9AfbQwuoxY<_&qaZ{h zYTAGHWd?N8RP^4slnCyeJMkjK%LfEE#78IdM`O!3+3r&jMe?m)#4ehKU=kzT!qRln z$kU@Gi(-@EK$OR#&dewRXdSPWeSI??IF3H$Ox;KW^7oRni-rq<^@((%TVE|0_}=mJ z_qWHuGRn$(mst#~JpQLXG0Xt%9<*4#O-us`T>8tLWf(qi5YSJESfiB(ZrnS3xeDah z>YjK;UqL;8^L>Ge$g6{ol4B2};En3!xhsdu;b|;CxNl;E)KCoA!g?A(u7(39*|!~O zi}f5~Qqw|D0`aMaY$s566WIO^@`bW_nXixN>I3e8iro*M$skvc8{Z*hI-yUyC7!>| zOaqELK7MvSoB=-u)|N*a`eWCP(EV9To_fE}gbC;&#+~`)gE5%SFSAH0AI7`WkXqTH zQ~yQ%^~(*Sybb&^Z%u*0 z)7p}g{$f}jC&@|3Qp5aa6lhMK)W-7d%D&)-2smjk5bl5w-zFb%WKVuoTy?7{LYfR~fx_2NR(*fXrV$Z}XW|#v?x7NP*pXK4d;~cksgdYN-M^^aMoQuKbKYboW>%~CD*3(r;qXu-2 zAEb9ws0SsALSMJ9w}2{*JC@TDnBTH_4DWWSX0S$hYsWE1eXk!pAmL+x8lp_cbAMFB zkiPmJgICo00iiyGgh zwh^S$3`~*h#Or&}{cW(dvsir0$_ycJ97~lo=R`Bhg`LXTli=NFVoyFmqiFS})3spI1ugB=A&<-%<@l|twvv~2pwq6VDat$3YYHjO%tgmvSs_vFlFvvAZg 
z{mqR3{t}pLSY5>vPXTH6uk(62Fx+j=%kx*O&tbU0_+`x?tp9}jQ@j6Z9gL?i)n(e? z4p7tR=xt6>;A?7ktwHGxC`QN0?Q_?LdcLN6=eOQRQ}x8H_$~Q6(GQ&}Jo04bYa}pB zH(SzDF92gc0+l^(w|z9PHL0>#24hrtm|>^xeLJT)C`8mwpZLmOE- zMIS)(B?7RuKU>p%ssxRbUcKT@4u&?{DyF56xq=3>!ZejWOAzVzS&W2x26*2s-VcWNDd7BBoY#f(4e&Y`j?dz_0iMrd=ereWpZ@%F;-BaXIJtTz!u^d6 z%AWp9-~Ou+T9L~k5j-0Q7+8NPhkw2YzZypSMj%4nUr0|V!?^@Rl$*Zx8m)D&p zC6#1@F8d335idr5@TO1tLkvbO9#ZI@5F>*Q3sRD_$V{wZzYeM=QPfKxX^;Cm5Pj=h zt%?&n$)?t$8I5-jW6vknSFblmWO?VleNXFN^rPV8v&V}pA%$Po-Jeqc;Wj(=ht$r3 zW{rM)aDvYlzRu%4oH%d|tu2ODySNldI^P+@2t*T#^9~I;KVK1|$v#aE3B6-v>x4%Y z;%gO>gH_Dd>UxVL*54XmzCe07O5Zh9_WKnv!dE0~aJrVfy%BrvF4Y}Nsx zomWMf`aNK8uQTZ}w-gc>i30O|;^bH>$5} znzABU`MZsIgJkG@Bz;q5lOoq{sKsL1NSxzsgcBppm+rI*D5O2^0xw%dT5_b*D z*t-d-k}dz}+sS$DK<}Sx$*Y&0C8r+ebfQpd;K8kCAxox&) zrd-%}(~EDPH8Ee0=|=|*3O-dqqgugiLo<7{HOf%<# zj6__X zu=T;$ZfYNidRhFan<76-_w>R}PSFd^n!+l-<34M*805SfyWb=(c`}DrSZCo1xg(U3 zS_h-3%^yzCdaX-b&=IOOC|e;LT~wgvvydb1=u`|V0hz5bQ8c2zE5-F?HthGE{1)27 zWzFeFEhMwudb!!n1Di*-J3N7nc9JxEtHxOi{l1(K8wfM{F^I+ zu6IY_Kv!Q4yImxB*%o+!SNs4nHOBpG_Qg%~+Ul=mUNH|OfB#HClRJW%uF1%EPNlcdky zJ`#*1T>NA(++&R7^}My`OXfhfMR%L7)Jh}jvVtRb+K)FA`#N_mH|}W;rQzCRvFO*_ zIHnep)w9+-@2z>O{Bk^5v6o-@QXT<)zHT89Oq4~P+!-YvT@xqvSll10+mTNqQU1k< z$yk%`dhO}TF@nTj4;{0+Eck($(@nyso$KVuc(L;a`8-HWY~+6NE=Lf#y^y4!K2xEy5BNOrK{pnI`eN!)SaJxkePOY+3= zAKSE_*~48P`VMpS{@_GC&CEiM6uMF2?)CVM2yzGUh)yVEK#SSSnWby9NbeR`#H}0~ zDE13$o^C~C(u{^=QBS}P@{oBv-`9|v#Ox9OolpMpl3ga0>BmiF$cKNZo;!745~%F@ ztEhKI2d2<#L>vYAl#Ou;@k0w`#$o{1@(p8DAWZhYw=QYWv(An^|Yd3bw zpceEmgCsCKl}_aI6(H%3ib&q0Rmr=GB$|~)L|Y|*o}b)Si#EH*cSWbb((su%z9 zNS(_ENb(^X4F#|+Xk3ij_phKmqy09T;^;r9~x`Bh~f$|9m;tw=en~& zHH;r`G@_k|f@$gZ;;$AeB0Cm33Rsm*5nIKV^rB3%=%>qf8A*c0uzT=C=fqqN)I7_r zBqw5yJh&C${`r(2q8cArIul&v;DVR~m%JJ#s5_tyi-w+uRIM~XpR8#&u8y9_=&dCX5N z+7h`rvft}o!4))J^Y2;^y(PNK`H02aR5i4<;;#Ou&^p*t5D?b(I|<%!pEz^dv<4*K zK6ng@J44S{yHh$v;;2VPmd-Sz0J=x}fw1OB83f&HKbcJOLt|J24O9er{V^{(aEl#?F2J=r*vp5&c^=y z(ZuwnK#Dx;U2p%zruntmxceMum}I&IUrwiseRJ5JHcU_U6{U9ngR}qM43M-P$(4bI 
zasTrb$Ng?`A4c3C4EL?U{S*iyim%E-4gZ6(!4aj_ZFi3Uz{CWjgZ{d z`3x3V9{-ofsgHE~<^*@x@u3MHx1}|o3q_uZesMpz=mNI$XJlxzmBDvctEYraGC}e2 zx#VZi1KD?cM7Vtb`%b&g-~N5-8lu%eKXx_S6}c%A(Jg>HZn=$c8ypU9Xqjwp=+dtK z+cLjTMezJSBTC=sPOo2Ve<&+Uzu(pP=|DLtUl{>!M<}@bL!ZIBn^Y9KQ(vlFgSevo z){?1em_pKt+yBr}4)9JTw7z@!m>haSNxLsr6|SB9YN0de2c$@I<)n>Wh>=6z-KZfy z=>w=c^U6XiGR0{cFTK&vy}Y3RrEpa4D6eS(D@ z()~$aH;<=9*7_fuw{M_B%#2KhxY<3Sn`<=RQdBhHYGE|j33G!tW6Z@i7}bft&ui!w z{q!SWeG_teru8P#(r%Z=WX}MFM!>G;@|BOIC(iXL3>pKJ5ca)SGGPs=^!L{TPHI2N zEqgC1X|8`Gnok%p8>ajqj^9+=@qULEB-|iea{l8EuHS!@+e&K*O^5Rp=T>EbA+fbt zxH+D9Xl#+j$Ipi-m~J@3q4kJ7wS zdX@av{v`tduP)4N7EM#e=rcrwMf`xvJj!bM+7GbnGm(0`BS8nJwq2%e8ZZc1RY{8| z1bp8^o~{+;gR@3&e6RK~8UnAS zW?niCYo9Zj>-m+#Tdqs{xG;UiU22wpdnYv!o&8A+PS(O`%Fv3@J|SboY()5~OIQgE z+|7M}{zEqWStW2f?^Y$yI^}C5ARh)qG>%NWV8{FMaO0y*!~>NMYN#0M+>3IxeU13b z`w+ep8Sbfx%7QhsacbqyvGv~A)oe7^fWcUH1JJLJHaPuaQ!dd*l8$AeUZJMw#5c>F zC;?H#x=iG5ZEqn|1Fn+I_78z{hhVs$LNT~TM^4fzzYlkOzufaAAFGEk@axiwm$yXG zD8^abdqTNG^N;J~oiorNahWE&QUctnwzZw+O_lPOF)a{2DJB_t^OoYEX==-mfa{K{ zKkt(WLA9)7zuAdx{FkYG1B$GD615Hnr1uSs(w!Y21{|s)c zcKSsv*_-(ABuR~Q^D%J{xWZCih=74an7-BHeM?eR*tchH`Kwc_({WFNtuQwyewN9wl!w`;VH} z5V`YiypWUnHYCORYfnn+I$681>*4)sOwYV9TB>c-3*OoGVKjAI2aqe@3exlQ!VRTR z&?z1pczKr`;s68%iLv0%|MXCIr@$oSk{`_9b)tu`y9vG+ z$$G=mWr&{o7A_K4RtOJSq2^9U3t;AvV*jZ-< zZ+l^lc=Ak~+}*5+%B-|G$xSsu+5RSV+I=P9U%GLB&56e#uUE(D-KrIG(m_|?8K)Mi zIeW$C7AS*rNt)v2G$lZ9P>G)QU@Bm}%Vl+Hm52s)lzOYs>LHFxccPB(lSNpsMI6w% z9t_zwRgNF?e+VBNd+oWk-CfV$MfM zN;@9&D((zxCZvUL#o7BVbw`iZd~3O=bTqt`cSI_r%iFO3OFB_DFy-y;-V}07XkX-; z+D#H4N#@)Yo86Gypw!q>xJJT##c{t|+=mhO2g7}9a6biH=Ns3b#dQ&Jy)Ilw2G=*h z`)=`mFuYFz=g;E2E}U7Y^y~j|_2uDIgzf)FA%#-bgcQkIvUA3HYzf)P7G;k@vdebtOO!no z*-G}Tku%OiiIO&nNJ2qn9;%d07|Tc2RjufbkxfQ~p-@`_^qByR%ZmiJ=;v z*tKmr^{^jUte!afvi*AN>&7D+?3~%HH(&gyjsH8`n$dGGhcjK3+CMdA@xA3UHCcc1 zbA}@3H+ss`%B|ZMe$2^#f3dKL#1W*SE_+R%SfW1U**@P&O;kG|GJTUD9F)2@TC0x| z#lIhtpueI9H+kNOKPu1zN}1~p0;^2Ovz0t`qpf6kKv+z_^t1p}yRkE;6dEdOS~}I_I}K%ld7PW 
z*7wG?gEW_$|3y6dF7@fjO9CO!gGkRjx};b&uzWN+uFuq>je;J25qmc03vq<|cM);7 zGc2h-cuNOjahAn=x_G-Z49eMI5O|gmTvTIYxxnH@aW>;JzngTOc(Z8Yscc0VWz&0( z)quW+*giB7yz+__#vYxycj5;dIJNoa;_1y*>OSXRfA6OBk^)!93zPPgQeDE@PkM_WP$?nD0R+mR*lY2YPC zgufOU>n9n;18K=yu<)+;*@rjup^mzjHuq}_q^9nQOvmR?NUGi|+AM?hDNBsgM$D&( zMa*#yW~6kWApgOZN&P;^>AAn0mQVxdYyy|!on25RSI$I5L=8#Yk81rbc7_KsQDC3n z1g1(xbniwsK(#pO#Lzk?l-^H0{_RIQx23ZvlD9c^TB;aazXg77I$i-g+sspql51hL8=-V&q8g0y zCC7<7s3E%QA8)j0hoJT~J@VaO)X>6m651jf3dt9)d>cPh1wLvo1vM!b!ct14+FzAB zAVC7X*@o&Pua#bRy$C#rs@Dx&^R=`<#s}Y;?P)xXGAYtOH7qUwew;iG&1YR;sm^lf zcs5dHRmCb5Z932uo<9?4MsLXEp&`+Mo;59t$FjzySEFP0? zk<n5!a2Dd(OK2?6mt!2?U#IY{#g38UWoB#6JSUH7%ah9c8le>4| zQL9q-A(kTsffVxGoeAZpNMifTqo&D8E7akFGYgluY?3C4#~iwMO;Y*K2@_XbKC9rq z;<(=}?!$=tgW^!tn-p9}KV0 z;&lT&pT*;B-{B8jdXyn}^4DK>cF)Vm3w`xex20%g{=fMw9%qH@oL{Ox?}2VBd^@*9 zq@qhz+vD4xnITtAk8*WJc%wIz^4<&R1i>a&&94l5uY-r%drt16M;IG z%BFd%EAhw4)<50s1r$Buw=)l3QHc@yK@V9tKN0Oe_GSNYVaMViL1vr%Km1D0?Ketj zAZN#`<6WOAyQ1zfJT4+a6rPk5QIaj6$mvKELD5mEKJpWU&(El|J`Iok1rQJ-YW z%ZJ}D5zb%5_7h0#=l|i81obAhkk1Mqy1A9l6(OYy2jB);vs~PtnMu= zAb=8T6b+0MSkM*G)n|hneju~pN0zwKKfICQ=>VO7>w2Tbi(>Oyw=@7GtIKl7%OrMa zcvt&zh=dv_mG>ku=NrNo=IoiO;1MN&-z$;1ag!qTrkwReD>pj1a%9L)ew8@8vi58t zSrG>5ZKTe=4**{{^rg+t(|By5HWvG7csjN>@9L$od1{$LYE z+QPdLNug~AB1I-Cj!bapSg9a#BdoxnVnq|ZCz>lT!YPf~YehUt+D7Y_zb`xZgp6$o zR;AUnzB!@5>yHnLZEKl+EmN+#-gZ0@^$+(ox+|uWg6$`~nfp#83x*z+9(qZ;FK%K{ zmuTZfrClx|+dVfgp8wNH+fSzNr0LNSrfYHnkb-3PO946m@C(B}*S*n`&V!|6=Jh~q zal^^R=ME&u9GQK?O`ErN*@~o%5OUcLhQlDVd!OsOUp^fezBqbs7<&`#yiXW%!)fPT z4N3Mxu>7L%nExEkZ$Ac`$7pSxZa<`7%{qQ+B5i zGDz^xjja^}_=78QPn;gWmj9_U@jN#E?R2~>jV}gwKG((i(d69*EsrCSTl6}KEtsB! 
z2Ga2*+3-vH;`;(W0G@f7}xpT%)qINkv7gW>gAyl#N!vv`~pF6pI{{QDLv_~}1v z!gQCh&vp9tZ|wPN3zc0D4aM%08;N-~8L2w^><=4L6Z8!$<1hVz@xPGE{N&|qz&Ob$ zQoIz35Srr-C|?8!p`a(6nE~@>dS+cYbR`(^^z}~Mw~#j@%q2Ep9IhVoL5%n zMhR}*B$Yj!XlqBQ=H8V6KoH473BM0w>!kO~yJ&>QoXcn(JWjigAbM+x7Ir?hbVg@W zbd;IE+?Rh7+fSJL>??=qLNggM9aDlGbmv$8d_N8Z;}5Fvul%L(St3Q z#}e0Ui?Dgrei>wM#nyGt`!+ib=lN5Am{S7X{-;YY<0t~y^=?6~u2b+)3R9;byA>#$ z`p&x8^qV-XMTpipx)sqZ)I5FHNf?>PnjCqRyhh=!bxUHj@Bzx?OKox=u0ob~*_&!l zGChX!e{(Fud60&K|vA*mLSXJ^p();?+Ja=(9$GgDe(zqy_Ck6Y20l zaNZkk+8#fZdh!C$Na=0lSP?`lcK_L2@!5;GOzIl`@RvZ+OI74kcI-pHZ2nM9h#Dpe z?7x&JHq3(VbBKF9ueT3bIP*)N;pzc|OFg`}egb>H$!qG5U%Fz~>3#Ud5XL_s5U4hP zJWv8pX597^BcwF%N7A+NN{okhWY=mD4KJN=sA$p`nY;0Tcwwdy`@CvgE@wDm#|h4n zgf1a>OI(7u?$*%un@A^x(9RdGtfq??pZ4-~IE3M`Fq!HYQD5}LDduMJ`^E6c$gqRo zkqj6x)o?`9&m)FuhzqI0wR1@sDSLtu6M~H@D&qs9}S>OXLuIbBCgt7k-8}An(5gbGcSU5aV+! zGo_z4l1^6S&Rg5ZVB<`a%|a)M59-eBc~#AZeg3ZfS5gMC{RKtkZggmUfs zbby2%FYVdC`iw##e_P4Yp~G;GUhi+;BcLAnCO-xP=)pktteg@z8&sC#EO#^dL&AN< zalc#KhY|M&!+mRTKLzah`ls`aZU5=d;<|{qUKg$-gX56Qnc{P+?Q_$F_T zFO3L8vU}-bi3n`n;n{Gq{sg>w*^=KU))H2Sx&1yC5(q^S)>h}e)8MtBx8|R=Wx&)a zBMp%53Q8tIp+w3-B;5Fr_K)Y@Ft6?w-DK+(%x6%RZ{7kye4J&sO^;^4$TCSY9qt0! 
z{T5VWr$ec{35&4}w_xw)J2AGh9NKm5ym|vn# z0ql-nLKaW^K1(BY{E<>$}hrrEwagex&_B~1vtQ9(m6sYh0 zX75#k#hK=w@)V3u;`)jz#O;f|$lHEnnaKn&@IHQ>K`e!CCbh2LEpB4-6O_1l^l0~| z{rNTx)V1Amhto&Qf5B08Aeq<#Eoagc>MH7J<5IsS(DG9PQ?2p^q@(q3RnO2!Oq_9{a2qIUOx7( zZh#&4tK}0T?Yj6bz%58=7L$)B*_)Qz(n<*rp3x_-HM~MWs!6uFB zY0MAa%;jZdA93nx5K#v30^C;|_q)Y?7;%3v+_whzQ^0k;as63b7ZKO%!gXYDeFL0# zi}S&-_x&GF0mskcxGo%TfcL@h`Yc{I_+LKz-#E(_!{B^esTVe|E$^Tt27{TGqt=nE z#egtUx#1{cf+7NeckEl-5w$xXiszRvq4z~QA8s!w1fXJL$B1%0JhbPV1Ba_Clr|lk zP1zj}R{Z4oiBd$swBGl0`5h5-!I!00`X@lsjz7=h*en73AVyqM4+2rkflqtmt^;UQ=l>B8XKkLW^aA}x5L5leT{GfH8{d9yj zFv@-W;-bYMrVmD{6z%y*A*h9uTlobLLQT{3!6%2&!z+cK>_{bGhsX={sEj@HFN)Ub8`c7uz;~qrF_;>GTZ3OAN;c9!MRSO-?GUR=6?J{^d zW3TgSBLey*WLb~Uodw?}JfzwT1i|sVFiv}AA;@PU9Ktr=0@P!j`y;PX0LdZy#+=C` zP#BPwJzvlXf^){}id4G6lK6cE!sQNdHWvrNMN3e-!%Iv`XoBdIwBDD;s{{KBgCd(L2x62cwT{Egw%Sw!_e1 zy_>Jf;%WPfjw+g?R-dH~i7VfSF-w}$OvQePeP6%Br_<7C|JS+corfx+a2t0Cz13YH z<)k2UjxQU#Kjyn6ZimzEPrr}x3Pp!OOrK7-1^%T($tM->o{hvS%$K>K(DW5ErV~Wozxr&!9wB(|zUtYJ#oNrgt})A| z2;c57W}&U7Am^OPbotx6Kz{W8fRgw%=&pHp$>WzDZT$iIMl_zJT4w~7Ph(#nT%%%M zXqSk(c8ls2%+FBjlXhDbU_ZpHdt_TWRJQJviW$0!t^cA%ayl)pv+>vUYGB*drC>Ez z?07k^k*-t7r~U7hwx}e7yE9@9Exl=gJAdDvfs`w7=Z9@|FK^1gdiQ4(@znshHh6$7 zL+>l;*?Rkk)}jtzY~_m*--+>K2!3NZAFc!DeH(M_wM$fYx?64rg=v(vhcClSjFw4z zPS5P`veLxzF^`q~f5s?IJqwjbvW|ny{ipea#S&@P)tUKd3;OouK)JY2Dj4|kJ?*ep z3sHcFq3C-%g<|oJ?H}I2kgEN!54Qi0Ms93 zUvb>;7WZMq{lReG8r)9-*ZIcvXK`IbT(1k)k-_y1aNaG>2g7*^IDQt#b>Vn}|Jw)q zU)|up`79o1_YBAvTyg7zM!Fw7`;HZYT!Gqyl|KSe>p=rC{}OL>qlrC99MhG4?qK2a zWVH$u#}}utBo={r&Y$weRrT<Bhxy$=@Da_lv0+Xs0W7Y@wH^gey6?T?R$xuL@7}8S^^v$&XGt&Giy_@Dxo??E zVYL0Cvcq=B-+m6dFv$wY5*~k2GN_N(xJ7Eec@7)my31blhrc!y2N5?N7o-5`X!mW+ z+GQ#xo1~x8H%B1^eC@O{kU(ZiotIX$lPq4Ase(->%A#|f)2Z0!7mHe?AUW@T%V?^g3W{M6v{b{ zsCst%Oudi;1AbCeH5xrZdwfms`XE2N6j9{8nPbt*@( z6?!-P{GB6B%g=g7ih073PbV+D6XALQKjhs?KJ*F8PYHzXU2DOJ_JL--e$j`}_Q3}Q zoy2U~=P4?1GXP0?_*l#Cc{#jc+N32=LDM55lotC3A*0KjeVTk}*x!+TuaB#uge{-Z zhum0wH!`x_vg8`1_jViP5X=E(Ou}mZLou}D3N5w$Y4?Yh*YzT@UectxH?amXI#~`p 
zue=RRh3QP&8>4CC+*tBsY2)rp{PIS6ojCUWnZx`li$)xMZUsY*M7g`C_M2ei7Rw&a z(9Y-4a^a2#f|ZvEjV8Wm-V3J!UfocrA3X`JN3Mg4&~QFC<`Af?oz7wUG8JIwRU<6? zt8eF&?sB2cLw{T7B0B%EY@Dg62zHF?{xF}d0NLYQh2+H3aOWN!y60=9wBy(;l4$b~ z;#eiI-<^cE>hP@&w7)ajnL@FAec8z3Glomv?#Tc{)4)8hIsXy)D5!nnyuuNocwqmQ ze&v({9tLtB=rRUXS!!7Y>K*g%;X>5;CHp$zdjh2Cl22aT&2}DhG+W6 zh!&szlh2;v@%|TQ-(ePCwXy1$u95OL_rQK4q3td7%OAnUKM(W_?I4=%SbF~G$`|UL zt>=umR)m1~^EJj8D*yj{#c{t|+=mhO2g7}9a6biH=Ns3b#dQ&Jy)Ilw=70JI1j0Yw zEzSqSc?vjw7RPn{-#!>#pT+A2cs`5A**HcCai*C*NVpa(HSr+|C9n=k2fo^ewox!) zn(+YinRWMaXL}+XZeF$Rzh3~NEoYsH5qAJbTjJmjhGy968Tn3^bxOtNaukW$Fsib1 zK{6o6m5Zp&eJl9wc4OkrE`k6r*4HFdzNqU<6TeCLs(L(0vwz8MxW>RMQ$fPm- zVQNXceCKDz2Smbpz&<6{Rmvc$u4uubjODo>yQB_rU^+|X?^i0tQ9*^&(kJgdvG3um zi9vS*?0GDx7+*SoHjXfgFz5wgdCBi5v%(jUaH$jC+mFR#L55pwOOZtdsRea(rBS>=JIu-min)nqRUg1jpJV& zHA-tKS#154lm$KTz(TQIRg;YI_!aa;&9L{1d`14$1wl;D>8{hk%j_tTR9%l2Kc;e& z-V1*oG*3#p^4I0t0RlX-ZN$cg^qBb7MbGqx1spq;h&L_; zmmYNJB!$Orjh7YN3iEE5c6OtO0kz-$?|UJ6`0}R6@{tSU#Mjqdu3E`1Qs|21(%T%` zD2(@V>PL5W5yK8V)E{BKN0}|!_BsqvL(%PrEuysr(Q>NZ=a?K5BqWRf=Y26#gm9$r z^Up=BUo%=-T7BVP9nyWR+7=-!Ki%i}!xHs9b#msc4W_U7I+d!!s*3doplHjz0QBL* z1BPZtA439RL0yDZ0gEG=OyN&G(E;(Y7KsBs_tD%{=es@{p&BFUG?$6( zU*q7CmPP_+bWE4YRIqOSQ))!J2D*Nrx4FUThO&(rT-wI0iHLDfGMP3)(Jx;wZqt(K z0D?I@8VPkR*zw#>{t{SScvL=yRS&D<8Xx_8b^J2Qyje7~YF_{@g-g(Z`Fq%Vu1dZa zLF?C%#l%`fu&tVFN%~o|#h6}5B)bXvt%LogTFqep^&blX!3mHc#qWH-9IHq4{JryA z8Pk(F)v4LrgY~QAN*JHlg&}*_c5Dk7&4N7#Z(F7F*MdQY-}#N+cd_*_%Sv`(^Qb+0 zdYR)AcD*WB7uIO~GlmBP10hJCZO8bc{B2rtr5Wx((kixz_@rpxn< z(%Esx3;SH}d7h(p#PaU?^dmd4=dVray{)q)LdY4@v-G)yBIvP}@z!A#tbZix*_4UZ zrE-4e&+NnI>lPR4kVhIb1=B~hMq5r7W&fRdx7Io!-(1d2_^3ii?M2oU5-D@RzU!1z zEtno-eR*(eJdvPwyeo>A2emj`618g|78ePjbf2dVB8R4wk`5dfBc<~0Tz%|wl=`U3 zBTi>wUWHQnG(DkZCxvql;oH0Fb(Pij0$Lx8#G>=BK8qdShJJpZfvu~zk<0cSg;4YD zNk$fhsB7~$a`#j`)%n=5fKASuB!hMO%g+TesDuKQZyTGs6oRRZufLW%r8)J3=sjIW z;!}}1d6^hv3hpb8``zL`jJQ7-?puTVDd0Naxc)4zi-_xW;W{$7z5&j=#ra@3PXWiz z;D>nMnYmGJhRTdpn!t| zuKg}Qi9XqX_CLdPRCHXGVt$DaH>f&o!8j2O8bdYw|_pSvFJ=>v7V 
zR#?Zfu7QFXR{g;xOSH}G_+~FX=06bJd=1L&Mz1Smgk;ixBmHF->*0UI3iD6*<#5Gt zfxDOA&W-5zQQnSNW=&cQ5!1R-gqm3!iO;x(hyQZcP(*FF@yM|0fKP|$3ljn^!LM7% zO>4qrkPRBBLj)ys+DUuow5~K#6ZLuIb{hwp7k+%tn|GT;Gp8*dYx4+@O?pcWr-vjs zxT?+Ldh9rq9e%Lsm7xQqm(E3z2YylXq%J8c+HOVSB_3S=mW|23Z>A0tIR`2Iiev>3 zjvC57jz7Duiu|T1pJu+6cwQQ5$z6{*q`U_qtMJdTV7e+Cu?N`?Z^80Cii8^L6D*GT zq`&_p6pHfle!I97at=`*QC=BiYXGA*tI!&5_f00ikBd|NDJRpPNl8+G3A8BHpnl{RANi5Om6tWvecrpQUBl z-ZDLa)GLQKc7Ke61ConZ_=UWYeT(dBT6Nlp&Y;;FX%9EV%_7$A-fSCu*RTEAq^}IJ zt=*A$$Lfuc#ShUj*V4iwcvy#Dy&lQvdjzXUv~EJ zOeVjk2>GZQ1}czI*Nb6|Cz5KxasLxy8HXCcT74QFix;LpSxT&QJ7KvxNaidH2BIxHj(RdZyAKblobMNro`2PWBr#3o{lYiGvOx%Ps5&`X?#{OsvWHw7!--}J!MnM- zS(FR&f8EDvl&FE0bWANotk0|HdtbOtc*;mxQ$BLUYIw25_yLcWuHki}%$M$|gIx9r=Rc~Xp_15a zavU9ncAG*-U)P--rh zJdq@u63={p?_9Iwg5tj7|MR>3uMZ>c4~F~J;C>3Y&Nr?KCnxBnkrwk`@`+O2<)&dZaNuN-gES$sEs{?;BLRX65ulA0HFRjbZk$3erdPJ90!WSIA;c!sAIoF!jGygL#PS1XhaKt)NbsN3 zHM0xnvHecp;xkVnPhPzf&bApOwHA%-GGCseI+$%YIy_rOd}Il~`i^H)R*K{t%WJMs z0*xQ0Qyo;mH~;NM6D~T?RDdyG#*Yr+5Uo3KBYKpmak{QSVt#}|8e}LQOAP@Y2Hiut zQP}TS`|-Z=m$I1d|7=sl4XlsGQ_Y!jR~{{TL_G z41m%n?j3Itl||Zwm3>K*iU|49nM=oB3!{W@cE@}@+9=eeXAf5-_5tJipGEr_h%i{7 zO7YuMUEqSG*Z-huK+no0L))+oV!U7CsWW+WCPITyiy6s{-3?SEeL-p`}v#?I~KB_qEG$cms}#I=jpw>@}3rkQ45C})2QXS3^uNe`_^%5w65e%t5;tpAP7dj`LU{wB8<^u+cT2iZ=ZK{%Dp{+2yKhKj19 zf2Z7U1HSjAIz^MFi2cHIn;=_Glxfcs+vf%*$Z0L*awmHaxa;(%y`hdZfIxm2k=H4J zJ#YEG_mYlddXDUkLp}D0MDLA5lSN_RdhTeVR6DI7_wj2LJq=G~nlhnw23t2@^f!KE zB+=bVUa|QqoKZNj@pk)txSpxBJ@#4*AP8LeUh(2I(l&L|keX|T5(e|8)R(b79ofF< zT-{N$ZTQCCuSf~>u;gcZy7T~gvL}pRzhQ;t(JZPPf3du{z&s)+1?#7^^(vGc#qv~w z`Hl4`euSAk6?v+=9Hb1tWm0FU15^yv_xfAmqpG#^vCEtYpkSHuQ*+M%NQpQ^g;XH^K~IQ|!&=~y0_d*42Yrt{grpuEP0 zJlYal)%9e&r98I(Gq1;=*2~*(U2i5%s1VM`6%?tZQ`7@Wo$LoO-h;jKR=c}V*zpC$ zbc*Z5%@)mx^<+6LKTY_n)i_THR!Wx*RhJ><3zuE45R;*X@{34O2}>%20=DzcVmy?n zThoQIQ%uCGy9PQJ75Z-;a@(h zf8FI@oISKAd%rO7DwUA)t=)XknnK>ut{CO%PLZQ~*L#7(m&i1%uTk>YoFb*mvQ);7WZMq{lReG8r)A|?|*f^as63b7ZKO%!gXYDeFL0#i}S&7 zo&t`a#c^FY-T?1|;q_U(Zh+^rc$|G|p2D-l-UpMzbJKbnEfBGsa#wX@S5&xOO?A=8 
z7x7*>!TC8g29@U?dg<8^OOpNc!e*gYla;3}-60SzYrDE?%mdoF| zfrp83%CA+yYvKy=KxApZlV>&MJ6yWea&MhvWP^lp}9yJ~qT%CDIC zF0;YL^!Ib3b9HSigM~VX8((wbq@IT74eN6PIgW!i;kICpxMQeVYaU7Fl_v7?!wcpM zneotj=3~AH!JhQ*j^Co3?$L0h7Z|y`@X7^){t6x*>h)Pl8>EB5< zW;LU_(??+I$kV^cc0xdO_H`P$fFAC7R;t0s%!A}H1#Efo<}fO=z@BwO^C(g`bT-MS z&>!A+tQPCK7Y-Y0Zlsi%go2ySAC21>9iXj)*2T*%O_aBu*ZnGXzNFZe)%H}}UL^W{ z@b3`XIzjn8J+psTlNMb3J#EfmV+>7~wI9g8Favg9B^Cu=sl)nwI;R&M`@xPp#yydL z%3-a{0A^04|@XD_=78Lk=94eL`my4W0}!uAjqvq{$QG{%MZ_k+sPsC&B1EApN9jH{1*Z z1ji@E*&2R`XGzehxQcYpg_3qGJXQyUzv@zo1aIWaH61?U*C0@xvBuk@=83i(h~>BA zutKb}TZ9kvSA%Pg?>I~2&I68|^)XrQJlJ+JG;0OZnQfciXSB7&9=R;0=C^xK6L=u- zWT+Sh!FU%+{Cb`XAXx40i0Z{~kT37V@3jj+_V^8pJ`*bi=jn8|-bn}nmu|=BpQ_3R zci2~Q?~j?IYD`ZzqAHEh^-Ia&&zc=jdy7Bn)cD)r$8OFCaJ~+nyLJ4F=+_iTAGzEg z9G?LR=`yL|cLNbZhxNl-HrVe0w)xE8vgU}fWhRj&x)2Dd-}`RqUk{zx$Bada?m+X~ z9#{QuS3 z-YWX;5m(jo!9>vY*m$}kAP~LzOhTa&ICfb*v)d?Yo!d5i`pDf0DiXlRB;;!hd8`}9 zUA~@&@1`EiD{;_MjSW5Do5tO1`A)AY|EKFu>;Ai85#`5kk_cV`?Lyz}DZzzb^W8Y! zQ7cxx84I?m0m5s`+O_xJiM+G*T#4hSp@z8q=eNtVDl%P%rtdEfw3NKO_ptPe2KB}n z#ZM{^cT!KE&_897qCg_#TvXFtjHM8QSne@wWDt?*KpNfvmir!}^})c+>UmmymP8-D znr~gBPSJ7T&u?ZrL2M&m=hpL6ro5GXs_oXvSh#08_bWAr@<_07 zVNmYsAJW4H7O!O3&wsoBgB*COR_k z^z_@97(Es6OtPu86XWAvtWqqOp{$VzN{wyF^Pez3p@^*gniL9w!Hz6HFN)l)ZFF~( zO@w?|I*Ff-q=B8=ggL9de4*v;QBjUc6<~ABoU(Jf10^V`)8KCT7h-a!X5Y|Z7Bup5 zB64hs6Or*rPUGq}fdO3QU0P)>z-Rw~x15`1FuV1)(%!xBP+kF$JE7lmUn3b%D~vg| z^kR6DFDtgP(njbJ^!7(-vo`Q++jKU^u^ZTZ$skD@DkIK(iEnQ8w*mRrQx@K0SiVc> z_l}u6g=in;txu|{260Ya7V)_66y{zQV*Wdkw4GyI$vY_GLxa z`EIE&ClrDiraXs{5ftw0>P>AQ&&Tq|UDOOZXRKbUm-y5EB%(Iobnb^yF<4;XoXN;~ z40oq(jp%8m;lV_+PTA3L01f*SOH7b^wHk$SjImI4s;1>)PbuVYL*jy>iXb7>Jm8b9 zHbU4}L|>(1f~b9dJ<~7fgy>2HeNi;3hv$)|qNmp?q1n*vuek0S;J&3w{)}K1wtn;K zt*Nv=*JH@mNXwfuee@PhkjsBRq=#G8!}olM8{vH;SJjeb77S`TL39pW$VO9$`Y zbG;T4iGU9;aY=8rE`@iEE?*AxK0)1^mN1@>vkm%F-@KvJS-|DDzjB;)v>{)>?ZwF? 
zO%g-on}LReWfjo_1B1y@TGR!F@UULgkh6GGL-{|BgHV0Gd^%&1kXCyh-^;rT*8Vm2PZQtig zB}flnrdGCL`Sqihr%pT~7V^BB^Eti`>MB{&@CLxQ)P>dugZDdkbPS#HArd5c{|;7R z_+jz>4<%w=M9bsV|KjYE+_muiyWWt#g?!21HTR0Tzrpou|DI;ja6Aj>`Dg=fUa%5q zIui;S{!EX{wuXc4j?0xFy!`;~D~|i!;y#SHKN#*?gZnAqI^Ved?EiET|Et%9>&W2x z1~~5)=Y!!q1sp$%$7;>0MBRfI7{?P)4Sc?4~0z`?YL7CK<>^cxkH<& zK=#?A%x48;(u}!~BRAIjnh}mXVbWPZx|vs(^yb7H(oS2$_xfZhpjN#jX4|YMXgqt? z+$Syue6^0@;O!*9-f29e;R>%DH1`n7z6`gGnaXB{Ev~gmeH9o>CucILaVFHb3<`E7QQ~`nV%y zdV$2KNViXa?HVW!{nLUlh5-Cx&Tz8JuTVkby+H@$IVqZA`9Za{s{ z`#j7i*S_ccpVa~CH$TZ=Be}cLZ-W9-YawDt-DxL=sbx9Tz;9V~b@K|a8_u%scME|d zzfMo+T{i~LCcoJ(jU9upRht5Ks@^4T-+Oz>qh*wmyTBl6&P9hDUFt#ZEPOm=Y;H;JSB+?E2P4&d|r}t_2BZ?roCtU>%oqh zW3rD!TL3xRqjpDd8|aFx_NP~G1EThi|K1p^1sA<`%AWPe1=QO}XlGX&l)Jf@(j9Oa zHCXNBP025SqgE1mmJ3)vtbFD6PF`0uUAViuqvbw~EpeAPse|E*Y6dx251c~uqq`Ws z%oM`zYagEr(4T??>u4yF_^l)z)I=jaEbdJp|g_;;mc>3Da3z_gC}H*9)$6f zZ!VP91zbd#s0F|K4M?cW5`1EE z3MIt+JeoRo3?Xczh6l8&A)`h6_>P@-Liu0cxk&%U3l@YnN`4(I0Yg68nj$)d&>}}` z_Zo(a_@h)5@in~&Yzk_&v1x0NbS;k_wzFme(^c);#XX&X_ST&>)Lj9{W+L6dV zPRZQ4&_E=(9dN(%1*Oaw&vLXhi-AOkTr~Z&9IU#(Lc<$?6=s5K;H(SfAm2xdZ`mbG z1!T6c;)FBB{jt)&eD+Dqf8uPEodD-qt1i-%jO`y+qiU+!pwhVykIsQ(Iqcr7(+R+H zDxX2<@D-5HQY2_D9R;q~kY&vzVgc?ej{Du>K8(0O817qx`zhc$-?;uPu8WB4b>TWP zxV{0-yT$onI8On`&*Hc)9B+X4!SMPlUN^w=Sv<}jX3w~JM!g?il^d0?_Q?i@oF@qO zJ5N)$a2l!Ib~;Tx-o)W!ek_C<9XT?2Qnr%HPWUzaHG&!LZ)s<7vNMC?#VlMl?Y?m7 zUC6xx^S7jWMiG{f^BKfy_>lU&?*~v`kokA8TM^t&=fx2Hk!FH zMZ)S>cN7Cn_9DxF>oBlG6#UqlmnKz-#bNTvX}VpyNYgconH=#%IGz-CRh7$_GWFDm zteu!lj7~N<-?jG>^(JLStbX|!sq)U0$geaR{am`n_e<@(k!+?aifqZP*6ZTn&1#VU;Si}115`X<42#qJHkRux!#>5*?l10!vL=p4*1i#dPG#P^W6@*Op^pcv{pK3sQm@~a2;T=#aI1o$Ogt67 z<;xp#S$zaqcz$?`ggk=A2L#aPrS%Z~`&`eVEDz?`%yJhAQUIY}pGn^HEc$_e-SxU_ zBXm9D_0#fM9>!Y4j+edES8UUy;W7djI2iS2Q|=T!t()Y z)z+IBj#>@$t~B~1!yOK*y1#tjnNQzC`3^cE(s6&1M8nk(uhqOsw}C8JT`eJDjp@`B zynWA@YEgydtqt#XxJY4j(Q%bM2knsw{m{WTUiEPCOvKBQw!7d>sHyw%LOFK+vHe*r zu~^^Ee|_?*KUS~lAopAuqv1?asuzG^vHAPZVB$ltb z+*%Z}K*D#$N|3WF;2s`#?JYYhz>Gk=^tQDesP~5R&yJx+EPu4yb)&-qd%h~&_axe5 
zz7@@yu9jGT%sAtwR=zhHe9-#!1GYpE=AuVlbtwaRWhTKHfAe6dUUX*pXgLTA-Yc;q zsu3pj&s~hWB}x^dOY97@U7>EX%()}|${cE)q)&g6=mo{L@}BB{ae&0ycKP>J%N`YY zg*wHwAwo@yuN$mZe~;n7MXm~Im=QDbV)T=(KU32WKCC{+W{j=R`!k2zXX5*vYpATI zJ^U17DW`vJt>u8whFRFR4VAp1WjNNZOZ{yu*ZRuJl63ET`gp}Jb?m*Vh|sMl!17z3 zjB}NRMAYmM;fC`*nBKrjE~*j)aL%OaZzw$+z4BD-+8IqEdHFMUbhZh`KRX`su>~RK zY3=w|H@N%dKl!Yw?tkL!vzV`HPP@}dXRJ>)zYKMOPZ;_4cdNw1c=}Ao_3lLYq;jP< z!6g#<6U^wEB9h_gxo0XjB`+bkuQ={^i~BI*{$RLo4eqCa>wM$-v$!rIuGfX@$l&@0 zIPVtcgW)^{96yWWx^TP!-Uq|$vv}PA&u8&CJ4)BX7OOn~;~p+l^Yx3jZ2))nt~gFL z;MvV-!^NrA$q>9jdBvvYcVzki?~iS2DkAG=){-_r-N9Jj->;{^dr{}jM_L2mSa(Mb zbuAq*=_IGQ9VtUx&+bgmbH{W;Qj+KPZN`HKMn`_t&E7$jSUK!w`L2LWHS^aoFbwQz zZW*xpejB-3Zkp6ai3SBy`CIn(UIgYHHLpriQjziFx3i>6g2A2ZZJVn)x6xr@*yHC~ z*{Dag%8_!wp$h9C2F^c;L@Ov>s$-59U~+Ip z*rtIScr`0qn(pQWdxR?MBG(PkAAEGj2j6(2s)<2!j`VskWj3$AW84WycNttEs<=Y! z#~+z;ZNrh&g^2c83^%05zjuE1X&PcU_wdu#LJ_z~{88z5lNI=_`wh+;D}y8h?Y>ha z1(?41gUKk^0U3X_Rc$Ik6M2xw%acN~Mn2>X@J|6F$Q1Q1>adYK$S`L;n$&L$c>Y|c zwz(O@8^=}a(sl)-f?mO^;+PH~*A}DgOssy$lv>;QTTzavaJ=uqCzAVMrVD*gm#sW( z8s&&~7#%0M{!~-gX-NcGdj($%_bOrfVeYoi#)P59dsdy#)xuExl%A}W&|8vg#?z2z zib7zxa4`4H2~Ak{B$#t0I3C92=(*`xL&kA78Dc>o@c zx9wUN<$>sX-$|rBO%daT_3sI;E~ENJZ-vX z0P<_YO{h2s5x6IA@96myPE@UCGykwhli7C(=f~?ITP_~ipL@mx&ISuVU&$APGpg6u z`uFVyZm-EBzZj08y`obobJ`ZDz4MEk_q)$QdMP%&?OFP;&seB`)<6V^P0P7cFC9XX zuk4$-Dnv$O688rj((DA&rw$9{+;xX<1bcX%Iv)TTDRA3%Efdsr$q8YidZ0xIPp@}A zJBJh`8B#68O+gC>*9E#3aR5)ejP@;~10K^xub+$(U{c|WM{eV)Xw%1l5BoBS=$^li z1v58%5k!sjN-m}X*1X>?eCw7S&^RW?BIG68;Njk8<$K!9Ftxn++d z2_QFFCG-Fq0_{_`9pBDO1vXn;>Km<|f=4ksrmW)HK@4*?nZwZ&zIC+LVPn38sPUM% zyNkMmx`nfhnz1io@{xi2eR|PIY4dC>UH1b}*Rxfx>SI6D4jqx3>qtdPI~4Y-ZMFfG zXtvxjjv#c3aXs|;3rv^jmP*2l2N|&JzT_EQP=w(inG<71?t!VyciVU$q@nJ7t>sIV zzKD|jSkqi^7i28a+<$51E&QRSxML>r0~mI6kqa%+alDOKsI>)DYgG_XGT6YEbR| zo8@mmxPV26<7h>>9606j4N>aUMc;}h{qp8kL#6Yo4DDnb;pJA3Q2%UWP_1vlBh;(` zhB6gOA{2SiEIvhRFF8)cxUxr``cVp*DHaKx6IBD_09J?6qG*Lv~aSwcZ!I3`ASl)Xo)HW_w2E&<2*055Z#AtQxUM+TUs-~(HnZp+X=Y|Wq}ROBR_H@0-?52V@6)6Ei|pX#*o+; 
z2ZlaO6-k@AL$`+}TIQcKkRK=GbHB29lzl`dA~n_n>ucIIoIGR#x%TVopH;EPCp+TV z{gi@HD@{+)WP2crFOE?Wu(86278@$u_08bPaF;_$T{gI4YDM%}v@E*3^sGRV+z$Ed zK$+bKvUvSMK?gCRjM^snibdzl!Cad*V>Qq``eN#n z-W{yQv2S#~ju#r~DaY5QD?$yXT%&i)ir9^Re_`XG3)&yJe)vzP1KP0Q^}7-10(uMW zf^*)OpbAi!z*0erZQ}!93U2aZmjYrXC-_n%hiChJ{3juJ==aS2AS(gv<#F=AIjeMS z%U0ft1=mkTvvY1wiD-tJLMfT^P$+HGYkg7-+x>RdtQZJ|Pug5OFOM|9`e$C_y#yc9 zBlYrp1fxC@etgR2i^9LGxZ1{>%Y5*~kChSyLjMKZARjO7XC)ltAmJ*&t`5&17VQ$$ zH30>jb{yQio%py-_tzy7UAX@dM{-ZT3t;*?5IU4Rh;xVDq--tfBPKM=qB<@EUPi=g zHZUn+?acBw`}8^RAJ=mh?}$&pix+R4$luS67VXNoK8oK(LN-=EF8ngV$5h$Qcc{r> z=61OUr3@F~m%@KRDIg+p57a{-E_a&b^iC~d$=OeR*efXPA zOT!GW2$FgtpKdA)(6xI%59O`dgVjt;ioEs;Y;$s(n_pW8p5fG`AS3*IR%MyS^BeT| zJABuel4y?Xd0v7)H)Vm*FL^ITMRVXZ$Ce`@po74J#D4BSlDKK);#tvVKD;dWqi(2j zA19{k_f!M06IggFdlRu5Jt~?@kXUcm)FsGxZ*Y)h5VuXyRVOuW9^5 zh;wDKAqEHqX)TXacEJ=yR)%FkR}>a-^rcwFU7#3&{ZD=;8BF$hBw#%16GZFO>qO5KK8V>2=Oz_a#N7BDPs;!TI8fJ2>FBKoLSSE zPYL(d4pOyzoK`~uJTY=DuX=H?Z%y>qLoLuk_3@Jz4x?~Hw1l*=Y8;!&u_`_+k%tTo z-ydr|xdoqBop_RRB_1l?POB?ksDvLbz3&H;qHz3K+dcJn0G;XlI(?Ss91I8<;`T5-A1}!<;^Lciz0?jPZ{gJ``Df= z!|eTt0+jy1@oi{G41VC|6edokL+in#lVyR_;B{LtduH%uz<<6!VV&SpTB)sMN(m}r zvAn^ZI!;dL;u*%*!FdJl?X!Dxc8vG#x8loWqWp5jL}qFvqS*8Wb<>NVP(XyG-Js|r z?9Z5UnT>r3eR%eFX#TLp^jncy!g_JYIc%#b?7J)e;`QYz`9mvwj6Ea7lRp(DoG;{h zB=mhkuypWS`D8j^F>pCGW1W#I^h|jAg6n? 
zqL?Nvg0hQD7}NOB$yoK-17U_8cG)xs^y?3^hvcp!>UVSH2pKShn)w#*!e|w-sL^15 z*pw0uOCKQ?%~OLsEQNAMErh_VW#I{u?Hi~OpCL~W(nsGlCayhCw1h{#ejgeoR>n>3 zyW%B|cM18VD_4hSv?0^Xhr_pH4ZyQqM`!g%q9Ef&cAGl~HySu+_(Ypg7c36ZeX`NJ z4bKFOybUhB1qZlKZ>QE@!rJ*=QMDH&aEghpZhf`{?geZ zch5TDase-1?ba)}vEp^N<{fP`$Cl<7ua}0C`e&cOU?QytKfYv0)9KEx;$GKibhz{0@X;pNIe$n#Esl(&N)p1Q4<#lU?MhkUKp zqd9vX^)6<&6LYwLL7DXk*PkPVJ8u-Xc>l;iZVO{Yvmh>fA%k0;I)fGLL`@cKJKq8i z|AvPCc9TPL6|*Vl{;J`FGWmr+iBI8xu%C*SzB?T>9rM&W3|k^|ln+AOxv2mmb19Ld znG=?5_u8H_a6*SqJin*?qXARXKfXTQDuR>Z9*z_$QNzV{_TBrkRIt!4RZzl=;NzK4 zD|!m$*-2`PWScW?vupch{2Hx9p2Z zfnS%YFweIRN=rpW;;IbDNJ6RlLv;XVgqA#_(4TIVnl^KMuvg7K7{a0M0g^-JvM)pN^2qL08go=5bU}uWk+e9}%KU8rEaUXQ>pV2%I9KDje7JX^abdq|&!cR`@lFBYJ6s82JSG8A6mu}!!B{K&r z2=}1(bc4Npc5lw==CoL@mG2j=Yy4HP`0OW)Iu-gU&-g2>zDVVgd*&_7dZIATo|lWG zUPRGyzV1M&i+4E<2MREGdxFlx=oCzOiEEfkqYZTiJt@3!I3KSmv?xwkhGM4a*HP}w z4QP^+#Y{IT70;T;YwC%*VCA+T29c|UXkp-M|IVjay#I*&59a)6#7uE_=d)G{Y9#5} zIU`(zi-^8FqJHZDqw=HeV~Aq#Nc0WCS6lI@o^J%> zAas{InIU{L30c{9TTWOMqTt>2HxW7BFzVKd?TmscmUG;9ckQkX?!39r+O^Ui(iUX? 
zMguni$@amXJVSG|&i5n>*90PU(?|`|&`>z5@uzZy*#_4RbZ_)Wx?-2Vw^)YtJz>b; zC#%H$(cnf@?|Irre-Pq&hn#ye2p!I}zs@ik4#>>~55EvMg0ryG!t8T_};{#>&7Z_cXkT%S3*b{95L``Z?~`ojAx z9eqvG4OdWN@a)#;D|@JqQxif1DiiBn<8xq*ygyr9Mzl>t`Hwl-9D;@ z3@W{tAR#~XCR2GqgvkU;mm~yg2?Zjmyx*vDFKY zJjas_vfSHQ&R{KL3GV#RL5eRPit+Qf!W&X=0tGX~A!W5(otzdQa{iW_%aO`BwqcHS~v+M zJCii`jk&{pDr6%YKDY7huusnCRqlh$Q~Rj7&UpcHzKa7cn;s}+DLS4r*#(PPH5{%W z62;sbjXV~yXYt5pxXWn{Iw*a(_Qz1KEN~FLT|5>d2A03fTEFy8L;&<&>+%i{3aw3!-dyG8&{*q1*X8!ypaOs^dxaQ%RTu{q*bYz=6}SCGiV|`xXuEd@j3rM86So({Bb> z`cFXAeSV-pX9kkIR4zGi<|)i#ndd*T{Sd$Npf_{AAA(5F=uEMG&w`UeX&;#@X7JI& zHi1!rStym3*wH(@6Z-hj@W^bw#_S5`^z^)nki`2P z9-jGPdo3vD7APMQ?(^!B#eW3uDZEXT!xk_3lZgdYK%;YTk)enu7V;G^m1oa`m2_Ug zZQIJID0A&KiBAJ|a6S1vlF1ajVSLiwXRC~UovQVc7t)7?^*Ycv{SNL;45QmrQ^2vk zqdCjUGJupkZ{F~P2G|l`@?IH@gZ#I+q}p1d@p0x!x%n#!a45NW(s!8wolG?OE9kJ+ zW^-3C)9buAifVh{^X9f1rs-!j{Po!ZRz-xZ4&2tpCrYOy$gfG`J$-ghH`v=}_vUQV zYLFL8-!|ls$lwlCD;wd3(?(J}-- zCU4^i<;@{fzFH`A)utLZz6|~r6d8y$my@m?#7+-i8N0s?1uk06(Wgw~(cfY^I(NAW9ORcyCi~t1(y!QRiQbIC`_uz0 z%C?hGcrdFE@&0nOgyk*r%Zl;olrp?c$Thu_qrCR$h#mB5UGyVfibl#*IW^b0^3X+V zcH1tB7@TQw=16T$JT7JIyDaxH4t5-^l9d?^MdPNVqcbsuNPsc#{rty9MDUj)-xK<> z?Qcr(MG|uDJki&p=0pSVkszOEf*!$eB+akQ_?#o4e7c|cJ(DN;X#K+XVp9To&bSzL zyColT)iaYrnIIfAA9XH`Ee>x_6kL!X=-)HHWO+U@<^q*+rS+wq2rxhRtop=uD&pZw zV5%w01|d5BZ&?a_pbKyKCo)w}_}<*~>eiSx9%qagyu{>*6+@PDx1cAy_Mz`TeKzL$ z)BkjX=PL{U?X&b*e*ewc=k%pF(8 zt~vM83%phnFTMq>@d~4;??h%QT79<__`&EkUN4ug4DgQx<s9%|ffB`1(b@SO{5^9yAq z9NPV<{nuetL}42c5J_7K4g^(SqFWR&I$MjG1ApZ z#sRI}I8M&auDm`6B~;1d@-8gFxYGrX&fi;ra*W%S+7$ix;V+7} z*X-jE=iBdkWQtV~9TE$?#Qq8ci>Id)DvJ=kva4p*l|IJ)ki8`@;+$`Ms+9YZkx@%vaW|ewu=z z&CeMk35huI{;~J^k(IFgMC!Y+UYy`{Yv$Ot?)!mTvtf79ww?3v{WH70>_1}*B=Pc2jxFlUZ;2$!n`SKR-jDieTnDw zv@x}XT&<53Ngy-Y$hkyYI4pbAs7E{>+IBFs zz=Vqb{qw(T~F$fT5o^6dtYnd*4sGrs#lzNziJISxlbj23glz1r+hccvJx=!%DBa8*%}bBA|@5{m+@^Oe}29A3^7u3v=aUw(d+Npb%f8dJ3Tm z^z4-G`**@ENX2f~RQhciY!o+QS61-D4)K51OD=}urZZB)p%sy^H#;|IImr#=?c{x5 z!zYk((sd&8%4GE9d6EYmBJ?FbYt2oRjKR`l@q@13X*ksPELRiZes|M(p=-tq*&xHB 
zIGg%&ETFDoZ^}g3h^x6dRku1HSWt;;Eel6NqmDUNn*BkrLFB^LNW3v_49}S_&k4k? z!-ve&7XqQOz4w1U*rbx%fBNirM&^IIL4R}WfBWp{G$~3v(AVG(vx!OEa2@AeD1amIN@5`l5I zIob3|FcSJ*^q9jX4U({2dR^tD23u>UZRd34ae%aqf_yMR5B5#4l%Tl^p7|TkI#f18 zK4QXk5J!8+&g1Z-0;=I>EVpmp>mcL`ny1CAdGbL0ANxpa(OI~x6N7Z_`oMQRHy(v( z$YLYiF{k!)UtsGzdtod*67(PXP-%EA22njD`ZIMu2_J4J)k~fe#d4`X+?IRT;o^;J zBw8}8c*v_lXu8}Od^*V8%X-lQ4Ml$}EU(W)_xyG>=A!!uJz?BGwSL*-KLe%5wU)?n zlMur_YgSem1a^)(hNXeOfrBYe8s$(b(4MQ2}xWrNTb~X5h_u^6~cse$<1g zULyNo8GL1dz96r9eD2is#q>9ckT>^kk9CJUvZ)mC;7S)k;e!;;BwM#oy#J>U-fv|g z`BL;Pv#-g}BZQ0pgJ&$(miv>g!KV5@-eAuM+tX+Fbc4Npc5luS_qZF->3x7r69Xr1 z!Cf#}udyGPYSF1SZI`w!ibA}F_X zkvoaE0L0q;DYP%Efd&Uw&+#v2LP|@viA!Fk!1Ghyf!wqv7^*F87@iUb3fAqetb5pk zoF24wqc$5;UvBiw8oQ5aB+Q0?iJM|76;~Is?P%m>!}oVT?E_@R^}ak`ISRAP81Cdn zB;X^G9$qvGiIDu2&+AdIFz}2k-_ye{57}7E{OZ%LgTwEl`m1xYaoZIOiP3Wf@cYYC zC)#F%&=ZFV={43o^n;k;?iYau#LewD?k$%J%N?h=i@ZFs!Sf#+=V*ekk8Vs@qhZPK`{PLH=sAM{BNH<%clR| zoW0u_vNeMDsjwCUdtLliZFx`Rt4n-dDBuoUK+1kXWi~ z3%fnW>P$UD!^sc4%U)l_C^razH;@}J}ku>6ym^FX}l9AP$n|VMq&R>1G`z5}v z73Z?jl>qeT>nbpD1PZBF$lZDo0v9!*aBjOWtV~!gydtlIhj%B}g}mK>2PHjQk)#jW z*lA08e5($o1RjX~6Z}m?9Gsx?)lrA2|0k{Sv^QXqqd%9P8ie?1?|A8FKZTkyrRvUh zCYVhG?x+6l1l4c6W1t)h204TrY>}lQxMkmJxBkT%vCph)8}Y_rVImsOSpyYF?`CqM z=!70LVD+Y=mN&q!-B+!2g9yE5=*ohnl|RZ->4~x;{N6uzN_o0nJF(ZoZs6qzeQ4vk zYQ>>y2?aRQB%=s-rVJX}Cny^e30w&yZ}z$vnq4jXcw;mO8J`;$cQZ{SBoBP1RSi<1 z-<_Ij&rMBuqKC4n^_?0n59ZHtd7*$Cy*h6lJgf!D6E*V|Pd|Z8UIrF45%W;{z3!A? 
z=W7^1CYNhCun4_;%T8J(_hSD?f?MX70>ICQ#UBqeR>2;%%ljv5XRttMk=`r$WZ=%^ z|4Tfn3&u5=t9c46Vae`9%WbnFkaAakk;G#J#hs=^Vq&)XoHsG8vk6C=)&!NJ}`%61WV)>oKE(8 z3VBcR-Z=~;!Ms?v36FR%@VmswWH{l1s;{4|U=y#!Hhy>Y4cx4lazDjo;UpDTCoFQ2X>JJ0K_GO;4v@DPD z_yX54cV9Q)<}!YTroa$w*9mYj?bm{<{$An*c^P>65kKi4uQ(_{Bj>X}^g3AD-fTI+ zW{PG$|9Fv9kqXmhY8KqCBmC@?*~KIRH?Ch$Mo+SAiniE=rS`Af2Siap*X_22LF1E8 z-xGAAAukaAee`A=Tna0;EtJZ`ryjQqzUGSiAJ?_V8|?XDd;092Zm_q{?#(Ee2LO1&Tp>4LHkC6McCR-k2eGkizGF?z{L#XP@t}4~NMA)4;rmCJqcy5MI z=2CGqqVoLVbfK{TIPf5ew%B_TmR>u&G~8Z=){m2(AUkaZJIYO%IBlZQ&CePcf0YbS z-B-8%Gb+ybtf59di&rM5FC>mowIlTMP#kfU4vhu3cBzjV4Mw6|QVlmZp9BJbBHka8 zU4*;U2eGnM30yDC%l#!ON%Zx$lYV&mfL`C%C=ZCF z$LR`7?jv1_ThltNF=(iIr@!}-KYI86gjSMd0t{IFRz%$pj{}OMUY|}&#_!7I#EzcL zfNQoSW*-T@QyH0#2YqxoihC;;<}{p-7-=FJKh~xLr7wA}X{!z3-A%{t_l6OW{8@g| zAro7?<{MFDL>Yt)susyiszYGN>M2VC*9B$na=aq&2Jmg-R0F{WgARA?!|l$gnbrX;7Oh$%vqe$vV5Sg)DXKdm4Q-%UXAaVv`jz`+`*hgvoahB-6@YT z$oA(Tpw_UpCFz2*zP8NNIn}}AN;fH#LkiIdU9uBn!Yv5$cu0;ZBY2|s@(YRZLbOgA z!QB3)3f17gByTk9%v`iOf@1r2g9JkY~7ATqgY3^Nx7(~i?hE)GtPlxCJ zzY1?{y$E|=y)k0_)uBM6h$l%oG3xn!#w1h~)!ZRznxxPa~?UvG+6Sm+?{foQ&&w|kf8RY8oAqMa?-+OYv z!4l}#K-Y!ix3N_9Y`bScEPi$A$?a}gL8#2TVHu)#4L2(OSZ+|i0wqd*`XmjH!XQXy z*>Q9V-XYQ9za2FMUHD@1>((0}ZJ(G<#rGb3gvZ@+Ou`GDerfh}?m;6oGJPAv*EWI! 
z4*xP&t&2jsG^dG|dkA_!H)-03CQDrX!}#7Y;vz(L)oX&Jx*1YSY)LPiGY6@9G3V-| zO%dsiwaW2SLLanxM8+-mcC7pS_mWU=8zlEVx|VO94;UWaeLUlkfE;d&397{8!3!&+ zHtQ7Kn6p9Q*sIDsSW3QL_|!EF`Ni?$Q;Xi9tNYVVT4leZszs>9}-``kXKI6_73o%>>gqOkc7pF{_iXDZX=u!V3d9o{`{>6uQ5S zuS-NJVtz@l1m@_wa?UoeTwp_vQZL!Bk}pPG~`r zx?*=N40WBSOW4-;2AW56b?GRRFo@UuRk&gYw_z(?WK5lp;Qtc0E`qLI5 zMB-H1t}QS4o3Fs5{nG&sXuuvj={!6(>uBy0}H^^3=gdOkkvwK|E9&fPcgYD_F zd%D5iKD#$(DfgAhB?f$f#v%F3G#`giIrFRi3KPxf)b#B}1L}1sx4+iBf@B!|5Q~30 zgp!e&v+OyJZ%eR5E=BO`NH?-lkZk)ETnJ{3I~6;upF)|B!F>fK#ps~w1ICFIYrN6R zeRI(@28S^UayHOpfwLl70yKp|NZrf#+{&9$Q2nSofAGF1yePA~Z_+y!AMvE2do3IZ zp7ejAu!)TXjh)BzHvKC>cjs}r7J`0I`pcR@RwE9x@4Ir?)iVvWW=@9Z9xnlSlcxN0 zm?5|@islx2?_#mw;ne{_6&y(?$p8Ik44$}oz+&=3B6!j}BjMW_548;R4DRabp+*YV?Nn^if0W7oT)>`e(U8D%|1rsE7W zKB*5A4s-+4pgu+hHwS3)RknD7pkq59{PuunS{|#`+(@LNw}Tt2I_d;|3^b`21f|)h zBQtnp!8|@2Fz)&LLb^B;^ytWL84sg_P z=W|6(9lNI`Mw1YefGFz~vto3>dgA1!Kmppbt3YnI zht`j7+MwKn3luOY2$F8Dzw#9P?`oNA0lZm*PX_g5vMjK5tB zxr94r^DaqDX98tlkn{{b|27B9PitAOiB+Pbp5zWq7WrsHIf}Aw)*rEUTbsTh*N0b? 
zuOGIK>%~6xGxq*DeaIyD0|5vsMD?uE5JMfr_-P$CFVbT*J$9X!eU$J^+18@+h#zzb zQ)n6|wwNX6e3s~}(YRR(+9NwEjR}m1JbA zr6=K0P10}`Lqb2p{c*O@-8e)zsQTWl#Q}7`@N=B{=>wEM`L(~&j6fRf-~RenIipY{ z-N9&m5AaH74c-v*M(AyEY0Xj~IFY|m`}avcI5*xL*E^5|3W5dHh6(TIWIF2}3YHn@ z*ToWgyQT`jBx0!Kz%m5O3ZFmTFmFIH$66-Hm`Z`0sabub_9RRh`Fp+iR6SZ*Yk7H= zv>iX|dOa(zSOja;t9TL*1|c7N4ii?VTF|)7suX(M8@;Ji`FJK{8W+v8+^J z+BGcWiGq+sj`$}hzK3W?&O4RLAQV^`^U_cPJ7hj2Yj@;)0!q7fal`y}3i{*!rre9g zA1t#HO)JwyBJwu&bT~-J%XN;2_&7VErQ~B$WI;JdW2;>%7K#Sohe`mc=7Ud0y`(ykEKm?4v;k3X|kgn{%W z{zEs}!@%o|A7r%-MmS9VZpb1E!TnTjvnjlW$dkmUP}egR5hcpF(lI9?Mtj9#lN}3e zBC%?JBK{nHqFJIOF=GWy@)G_|mgayyL*f^{lYEGB^yq^`vLw`5R18J}Qqf*cVUM5P z4fgigy*W!F&HH`z>I%G+>qx9wm;nFu)yMz5G>l$en}Dae-#~5t zn@{Jgyx?-8=P>hF4_Xmp2e&CN1U508>~k>K%cxR#Xy)h>_2W)&?1J!JH%6zj&vB@6Da8MYPa$GCY&lTZT?-#i=^t6Iae;bX z&eRvKhu}{r0!2;a_ z7A=tuSZ-+jWYm}Y_{paec5DUqaDK*yX~9JoyPu8tms7x(7w4=A{4A#V=J1c}g2uJw z|9AtKGfni*2ZLD^|LL>vjCS-t-2j>mFaNjCeye*_Bs@aUSu_$uEDj>95+`Lp!s~`X z*kIPlS0(60u|J!i#UOr5m)e~XsgJFST@1}XRH5(MBcGj2hjD>>t(W;P2rsZ%^cQMY zBD+^FWA8T9Vsf3#v8Ox%AeyWHrfRM!I3?|4jz>~p*4p4}87nkF0-3C|^JZC+B%M#u>%2E?ruZG)*gIuRR8lcMKcyU>`5=fn9 z^E=_$4Y?Fd?g+dZ==fqv^~==G2sf1x`O8aNp~|%j%QjAdC~AMjyUvfMpkn@)e7jaW z=Hemh(qpnD?k+JUrx*8Bk>9&ynyV_T3loscUtRgJtI6;L3w+F6 z*$LnO2r{|IP=&sD-P?Nkz5%k?JmWJNY=NT%Zzx%b8jl>!+iaRsfC|+Eh#@-vaGq5BRV2#^U~ ztpXjf7O_zKAwPR0M0H3b^<)q}vrpPD{wxbNi=WT99}|QoH{W@w$`{~wE( zFTK(RO{wVSmL8lNdW2;!8>oMNZjX-dXzJ|?GX~Gj7_WWW3WWa3LL&Q*hhPE47IZSo z22*$md?oXYLx)?e%s!k80dckzoWbrzFl%N6xpmgyMvn?+@;?@EH1RRpZ3!1N@2WT3 zROk*mjt2f%C-ji;XCyrAh&F<0=crFVsUvWg)5!RjWHjCu93U|kw8wjSu)UnZ9zVOs zb?xy6dp_8nKKno2;Q#Nldvlhtq^0HU)sN8Zna?93l4NlN=EmQc}>I%G? 
zKR?3DT#cff;zq^C?C_isqx5pz3+Pg%c71ER6pahM?JFX6#BchAPRv|&0SO;w)bU5QbA6c~!*g+e>GWS2 z`3ig_xj@s}z78%+{JsCjHXgi^HE`xC4nr$UkH+5w7GkAl-QA$-N}MtBpox!=$4OW_ ztgX#!g&gZL7M`BVM_+F&BsB%4Eci| z6~j;Ovsqi+T)|p-`ar044Z29mpw{S8f)wpJLi{W9fWErB?5QbhsQHIf?>TK6EI!R4 z9zwWF5L!*ETBa3;AEj{prYlQ;*ABkGggh8F6MOK5kW;_{E^b!|{4Df*VIo7|y5L8h z8~y~|00yWH3=@1XDCz2XilEOzuIJ!C-C$BAdHTWS9Hc6A+wR|-y-c`P5oJ&eo0fg` zje3eua~AI?{}>@R+wo#y#S6i@jAz6|G7nG{xpz$MoD*IabsdiE>xHc@oPy5RiqWbQ z!%LCvc>E;PJ^ty418Nm7q0Y$BL;RmD+}zRxpkKSv%2;7A{#+BYk2CW;v`hUxk@odI zXjz0#`R}|4b~p3_3dzwgT(bW=r48gi*TIsr&`KSy57wml?@ za8i|d*#^g@O_=Ooo8f~6TS}qC=imbEyrI6oJD8+nG+{*1Xj0oL`O}|7Kq9$Mz=7Ki zoNb1kZf_Lv2><4l1KeqNth`abMu5O^E)?%)c*c$c%734}OhN)3IyF9Bs-Gt4*2UJt zbc=-kw-ViwS5M(`hrf1BdMAu^;tJ|=9>V*%==Nmfs^U(d{8t1Pn3sys}qIe0;^N9jBN>MsAXh4*s&8vi`zO|`vsy{uXm4yZ#p6+ zY0KC4+clPWWmbS?t|j(Q zD7L~=Zxc%ttV{3~aSX3zI0v?Z`_QAcKs?iOXWgf&03WQ37t!V{z{5lX$wRI8@HFcY zRNbbAWLOjKz0tOUYnIpSJ{)(!svYN>+5Sc$H?jH?mjmNqmMdS9SBWzgyBS(8tdN1| zuYCrxO!bi0yvz5}H9w@1!bo}bt0=rayVC0!9&a; z9!(yyh$KEf7QtTY>p9z3P#*HLo^wRIbwJ(ce|Gd+si5J@w0ne*B)=M z=Y#F(vwOP1-aflGXB$Ny-I}WZ1jlIx`8#MIV`fly;!QjLbWoW<<52Li?jn+{BU5yGwL>MG^L^`k}$%8 z4DZ!lUPj_*%a5oKCW9weZB5P)-bedIDIfs8=L!8P)7;1?MXr9>YB%*{=2u^mYO%e4hUKSO=sAE%!F zl=sef|4YT$&M)`yx_U!*$i6^$C44R!H=2TD0yp%=Tr$wSgoigNNdZWib;e6;7C>Z) zCA+LJ6#Vv|FXO#zg_iOUt!mbqAdAVS$NYm9AZc>HK>B3|5Mv<`dnU^O_{5txoTi9F zis= zavnBjK5)o=ABr;*SsyA3BqN~Td)LP^1F63D%%P+7K;=%?1+JZk(B`9m5>tK|{;){y za&nCD-m78$(VFnkx>cJj*%5U2lkXRveYMkpD@BZ}cS?rv*DhbA%shlNTvlZZcPr5y z2VGZAWPp}#^^1mWox-9-8HZyRPC|8q%)VH!Xt=~CYWeqR2H24fE~I~;1YD9y*G5hT zgE4!$t7kY}L7q2JE_>!FJU?{TiVO;43Eh*k?OGU$9H3z@Uv+n+*}STpgP??1ypDQ&}QbcN!S-_D&Ec@QtEaVdGm+5{jbn z8~3d80!&tA&y)yyQokju6Qx}oI-ocwykhEw1_z9~lZ+C;sbk@9!ndPAj{oF|<(v|b zLuU0*O#35EAPaTYy>SS7SO~Otjwt5&j6mmZ4mbL*KZ^{gMMho3^b{|;un0I``Iw7vkHRN+uE3d)7ZRK2 z{dThs5S16S--x)6KDiG+lNFf7jnu=#<#OHd!1m2cAIp=$GoHlxpUi`R;_b0k{>Vlo zLQ7isQ*;y>cHBK%yG-yE2d$na9nZrVe5(vM4wix>3r2>Utx;&7@S_iTvJVkAWznDY zc|VZjx-}~$=Lt*?m#JM=zl(OFZ>I@EH56k_u5_@x5Rgu-9NXj$L?48d-0Vlh@nCL- 
z$**HR$o(wCf%V7~q(6B-F+R-@c07bvFFB{9BO$yDQbXk^WbLwb(Rd!*Ss}7di7`h; zoT$V05+|;v>f25`qmSne1CCzFb3>)?t?hMm`H=LN;zi-(W9yyO#&s%PH*fvwK|E9&fPcgYD_F zd%D5iKD#$(`H!{w6j`l8^$%+Yjq}~GjIMcemO&rltuGYn3w;Ai!nsA~>fNz~(d~2e zrye0c{?_L?Wm9ndi|Jd=&3vT(`}}FqhxzEjGqn%qCJ9je!0=m-C&qXsfjNew(Gyci zez&SVoPhe8X`fFx)}gkC1SO?T#tx{((qTRnAp6T$! zp*2?p({-;d81k_32;FKdBvz98?p~OLVV4?s;GGp_|3>&a;EsUO9&&wpj6v8@Vl(V8O)b(`{2t3P??`z5RQz?S-4Myl zlG#pP2I_74%M0JOZa(`UG^KmO_jkq$0lp8BjRm4)WZ;y6>) zrf|4;677pM8O+5I!Jn2@fc~7cWNlg{;MBWc(8s25kk0v;P1!>o*k$m4suaWl$zT4~l`Hc=Veww&_|bO-?FN~QnqtYbKmDX(;|!47ITkA9KfzK_=( zgx_ELVTbSlOY>{jazcN32F(Y@$54#?QHAHO4rC?Ic}T}*jkD?K=jwL!(NpJ9j-v+? z(1=lSrV3pUxFwq~9(^bWFQqk<$O2(FS-UN{oOTsU7CV-$#Qf?=RZcdQ^0h=h4|v}p zTMv{f+B9b3UxZSH$tmc+bmF6nZhe1lSwL;+szjFp1$_O>eLN#{Fd03}6*vB=BO;HZ?*&7{Es7-iWcsJL+llbdxbuYZunnj-zZvgzV5 ztzNsOy_g2izFsAy{o1fl`^h$Xk{(=nOeKU@w;6|Q%KllE$;YipKUB7bo8cpebTtN* z6vXY6#>aiX0J?pN9*kJ(h25_P6Tg40MN55QPJ+YjklgT(fa{rdNNM~$yhig8`l25P zFBp!(XmX+0*%w=iiWEnzpJ&%$wsIJ@L7q<+tuh_h*Jc z;M<(ug}*7-FY|2S&Yd1YPA9sp)2;@R%p9(k603*OgKS-7LUsfmP-N~}cnTc92{+!S zHbRmfJ3q&`dML1%NV=JM2S58BbWt>$4wWt~tMsTQ;u+m`zIx3>DEXCQDV*v)c;Zyg z-_*tcmXZh7hW&FdkH^6B zFX(dJRZKrM7Ok~x|GoE>25U?Jzg)$jt9PD*lfJ62D8ICcC` z3cLX5z06DGz&X<@rv*YE{-P5hvJ$F|cw^}soPT-2XBH$`V}v^>VY8>d%$pMO2pLCn zjtCp$y&jppeuKT-?Oq=2e>sK!&(H30U30?;Y4iCP%L2#k|D3;cIdcSF%a+A*I^W^2Bi1B`^M;*i&H$_%~6aO z;qT}Bw;xv3!ug~dZegX~z+@z{pWL)TCmfmeC=ySe4}iK6?>Lef!?D2N$)jp4p;&XvI&0`s1mrwC z%=U4`9^5Lp8m-cif;bAyCrG-i(YA8%af87uxI*{bxZs-~KA=$+l6SrUd-$h1`O}xc zs2isvOj~aw9_Ezg{N6I8^_4UKDp3xqa*K-I%})dU2V#2d|C+;)^{8TvD`}AV4*BP8 z7dPxCqvH8|HyZ1=rCg^!8V`Lv{QmXG;ARc7fBgnH;YnE~A$JR{s8Z(%c`!J%X!I|q z0P8`5K7pTwt^JC<1g;By3$FZ+H+a8DyU5Xc551+o+WAkPotB&aPd8}$^)_lF!WgNN zW^(_Vvu0h)&$9-4q4tYF4m)6hF51}CbwqgL%C?$=dF2C;bU+w`21_K=MkTMrmWh+^ zIVc=!Fhr8~s2>3hYb1M}l+C#D4$Mlw>TKZ|hegHChRCey;tk%&dY=S)P@|UcdL~T` z^>6GuYV|}J3bo|$?*1X%(Rs!mDoE%7bD|s)I4l$Z68H|x9wyumt{zsH{+wkB4Gz1J z&L8f_Eb{fO>cV>1aeP(qDmFl0;+eE*%@6&-%Okn>-qfoWRDtDJmmJ>@t7Vi 
z`G9e8^Ek|ud6ZoE?Kw=@mpRaMX%w=iS4Ov|+=uc4)d6#3omk9j{c;q03Sf-Zym0<= z3#7g9bwx1eA(r;J*KmZp7<3peDrp|+ghS7|@^aPnv0(DY4q>?*pxOB9&yOqjAoKL9 z%gw)OsQLY_x}9|TplX7&n|Wsn-rI1Giup+kj=y}mL*Zi+#7 zp6Sx`%e7Td%8d}HdB5xMC-XFa{5{HjBp>$*dpt6DQibP&`+wk! znlawnWUf1ZUMS;@SnM73^PI~T3ossbudRY138u;grJwT5!0c9j*B6DoP}wf}rrEL* z=>L^X>%_z*tRmQTv@SOU`CWXF+R4!j&E)pSQe?RyhudjKs$8wW5#!n+Wlb;4`BzJa z=2tYF{Kjy~&es>`r7>)!ceO*5QRAWY)%HO8`qHF4D8rm(_`NhK3%10S*IbV@g$p6x z=67sJ=n?Dwe)$j;pvQPZ*P&Jb+&{rPoOa6w4y-4OAB~K{ob#!&91FrueGYA;XQ+{(oh1JPZPP+|H25`R0}E(w)$h2H&#rxBmN+@aPgS% zGk4VI#q;iPUluqZ3n*t5qG8w5i5j*-4{W$(tMcck18A1qQ}g_FD!90?HLXK6AAEVI zc187I8hO5RO8F9*0E_DRKHS}#1(YJrQmI@l2d)nm&L&0VBA2@oXrt5&e$->TvWJa) z2Q%GL|H;o6Gt$;@N4-o2f%<_1K|%>YY<}fY-6L;wZn4*Q*PAq8YWye5U_Bh}=-7x| zG{1^FigvxC`4$an2V0#uew6_J#_#F2&&ojE0VSD_h9uA}+H^`a)fP|>M2XAO`N0S8 zrEM2dNLV^bl{Mp`E-o!q({a;|gm-A3`af_nKo+~`!%lwAKo3&qLOKVt!H5o9qm!Bg z=;Iz+vHBbc#`?YbuTOg-hb>QhZ#H|PJM3EyYIb{q0+7yZAsP)_zQ0WDc$)?i#msyp zcBg=`XZrImRVu*Tcb%1h1L=q&cKu)8Aa(rOy@c*YWJAsE@%CF78s?Sww>P-D^|0)R z#RjxEx$>w*pb(%=jxVQ0x*;vE-kBn{TqL2UFf7q{6HI)VU|M?7183Y~u1n5UqBn`B zPs)jj;KI4B0bB>uFp|zz^-&q2f`~#+KH-pidgwcfAia0(aDn`+t+l$RVyzdkJ-i4++uB}D0 z)*|MyZ|juHVJLFwGNX9kE@&mwE!-Z$gGF`^CU8i^!JpDw#4Y#a0@4ANu6LulsJM4f z?&8HLfWAwfZO>2x;jHd05;CgrymjmSmEa3_*~FjuZ^LakHkJ^UlhXnUJUZrCo*1FY zA*YD~duw1J&oR$U8409oYsI=e74T(UlhPZjC!yBkH#mY;0y36;Uu}Jzfz@hv_|XiI z?^`_MW2f^Y!3N!0xqx~Q*fx94UphMui3S}hl^zR43_;Q>2SP)E(eT`#uI>52?w*#* z^^iuunEJIu5Visns*YUxjWiHD5X!RiPAkk2?417WoP`ctjJ`i{s~GH&xEJc%`UnQ? 
zykD#oTaQ>B5^NeJyn&)Y%7S#sC|ol8&;qRT&|FAI@Vf0?O!ZSh=TLen>=n;A!5){2 zm|u(8yNJ|+m-8o|F?;!5wWazFfF)6lHnP%BQG_iPD~%*3)T0rFaH%r%ZLJKCEngT-6K@Zd0>&)5kfsV2&Ni7>S4U6)U-ejSG#1En-GY!!0u=;Pm z))bVd1Zd}XheMvBeL>rr0rnS4ZW5eZ%w!I2^O21M_c=z|eG1tLF7 z(n8CR*N{oVO@4Vg1}FPg+6$*uB7fZXXmE_o8}jmT zW6vu>eUEkrKH@S5m+7M_bU3cTj`0&RZ>TQAj&o`hRdYUgXX-=qCq?0iy=Zp3$nO+n z%pQ38l1(ezlOa;8rk4QUO!-f1yh_G{L)NLB58}~%8sDssbp=4ihC5q<;W4D1)+~t% z%z(FE$lcnKlZn|wzegFY`y(z+o0Wy^D}Xg1(0WXv2-tBPe~|Xb3##uCy?;vaBvi?p z-9n0VfYt1~UPwziW0$ug0?g1IObvTWB_EFlJWJH#sgfkbcPvYC*{c=$Rj(X3$}z=C zZEt<`7vdpXph%-7Z6tQ^u+tyf*MRCoFCO}I-5>1uqS#ob8UVEUhgM%|rXUe#zYLz1 zBFOa2(~Rn`AEvB1{gX81h~GRszMo(EDy;e98Izyth|@RbHo1t9v(kN&*9F-YJ2p8o zFg`kLlWzb!j=bIM-NF(Fw#_~mnLqchrvQD*o`-GbXK%iJN1pP6O|K$zF)9Dg6 z?_i4t`4>w`N#O9c|KA%V1`7R~XDRnDQTRN{0Q%Q%k1*XG!`5%^U#zL~#iM-nP1jeh zg15Ve6;X1`>Rd8l3QJQKl8F0i| z70**Xfb&ms>sa?&;NFwUq@WWPuw%5xXGoC;jD0$5^zc?5>Py-?R&JgS^!2t@W3LQ6 zwQXK-*4he+NNre{>sZ35c)gn&RzU$#$BsyA7IkVpjYESQzCtNFDQK5v?YFyG zZTNhm_hsRvtI#UH$316!4rQurdXz(ipx zvhhy`Yg|2_Oh~zKj(NwFvzsKe@ogsoWpi9u>}b_;R0%h#spc|om_qaG(kj89@8abv z%^%vx`&E-fi9v^{CTu``;IE5d5>(2}HsE-96V^=M5p<7EMAXBd)}IQMLWh)6N2*<#cUp zTbDT~nS064(sUiJ3Cy_PUn;@xjNbG7(5%50JoI%Vc{c%_to-7Z-~bfz{5#8;_g#?B z?x~bM=)rA2ha%c4T5yqGWX}1W)d=A4^SUpR!Rx~CWQ z!H{i=f5;xBgFx;u`@>4OuK0s7hQ10GFt8tcS%jhVS{3!nWBpL7E6tThw+Ci!JLma@ z>;)N~72S9JqbryWaKHFoqzu>R(>!e@Yj6ds3bREn#sR+9G|>@k_wa;`Y2vx60eEVl zA&rDm!0PLP-oN>eaEr%;Uh@SfaQx3TW`%8>fbyr6JFQ_bY}_+B9D1)3hnJ3Wu1RD- z#=Rb;$A3hCz=)F0?nSca>Gj80G1`0>crVlL)@(1X81(n8e%uDDBc7Y375IWSp^eLb zArp9;%Q5@sv>BR-GLGZl$cMbf^1S`$N}*MU&>fLq*YRi3g9g`{Gydbx5?n-r*F|t- z2)+T)yCwQyL{EXp&l0&VB5y$4!HE4Vu{R**S>l})TKg&d*m?-kUOn$w;9dm=54k>h zeKG+z4E849*+>BWHHSXPUXB7;O1_@;KmG7fw0QeV?J(>cc=S$^cr2Nh)omtuun0)g zx{NqXdxNk?)VF1W)9_SF)t6l?>G)?p^vMn;Ijv5sZ=fx zzdio@#z%HT{BGidf&avHa^Ltq$Ax+=lq0%aIL#D^4vrmq@-)U19k*{h9eKnHJ`Nv} zoCBUXu)&tzhSnc53Gvv1p&&S=mb9}#&uO6JHwzx7|6d;`d~d~dUNi#dYLHv3?BX1~2xafj5W4+_*esOqTXERF!(#`5wJFY_v`5G-V6Iex~U@ 
z&nMk+wT-i&l%zEX<+?n-Um=v7BbYVOyx|1_YtupD*lgUmRFZBP5sT%29XYCR9tz$c zTW<`W&PJD%-m%br4~1e?ESiQJtvGOw;`RzxGG-fH{qxwy0yW2f73;`}Kxfg^QQ6sK zL>ahh=lI|<?PPOUe7=J7JYq#fP+7HzDc;NrG-V>#kTD(l=$_v#SzJa0V3 zfH4X%eDQF#j|zaX!xt~^(5rx?T8nB9w_t2DH&(X8H5Brn5WZlyEd;bNNl$9u3r2~< zc_Wpz$)M@|egg(Ff9KNtPvy`_18iq!#ro}D3>MCPrt#U*6i$!cc5*MXgiS9`e<%`g zzBkc#JB?_K(b@CNCew5vcHTr0BtIoo#=|JW(xv8J5_GL23Y z?EX{*DQ~CaR{h&JQ!hz8n6(0Y8M$3DVs;Da&jr-|k_v_O6*loE4%OgKYP|oKr5n)E zvtn19PYOEqjP#~Z#0$6`u)I8T-XFJ9@BJ(|SAkW3$q(HcD1zL=X(tO!DpBvj5>D;Q zncz+I)=cfs{oZ#)WDL>#^e zM>?LSVj~uL^Al1jcrN>P!I!i`_&s&1{bhA9E^ce+d8E^X4Uf=&c81h(m+O|%l=>fjnLRv^U8~(j^KNP?~ALxhG@p_$H0uECGIc@ zv!C6O2A6k~MRA!#;`$*A-u&!TFkolK%f9U@3Nm8&%=M-a&j@T(N|Af8uV;f8mT!}w zrr!HElWV@moQZ1mA$@-4cB;qNhOQXNg=FkvAajV8niw*c%Y@Eb-2E@3_6! z_t<0jh-%9RxA#S$nVv>WdCmk<=W&BSzb$b7rz`1hjpiV_gv5R@-yPW=)(dK?3<540 z)M#Zf2yTc)u?tR{!<_ABucmUjBU34-?$eTCfVZ{Z{Sv)CIIF7Mr_vn_JtC$X7p@k< zKen2w%JCkcKBa%v?P&&3g{5+T{uZM73Q2QA2?J=Lc@9x}=;8Mcos9vP1}XK?Aq zn3b}Z!SIf%8=KxE4^$#g|L&J=5{l;EHf_@x1>`NewyAyb09sEjwdbzb1HSYEFS7(E zWGJO?vv<`F71>Rh-I#L(q_WTB7j%8Wg>UxS9jD^K*w-B|tDjv1EquJPj8AKTs*}Tu z`kR@k?$fz{Jp~Eo0Fv_UYE(FCy9EgO9ya3S2Elr_DJ>;Z+sf@%>K6)Yk~Cx_dU^ zElUbgEvaGto|=Ll1nukWH#Y}1PKmbBWX?y{0Xj-wgJOKI^zg^|tP14Q%*oxtR)mxc zPgu1HI3R`H15@`7I^vFKub-d)-oXoEE>?G&+7O-hk6!|3ijhO|yXnJLfK~5DH2sELmOKbwPZS z(eW`~ZCvDi>-JsRv+(b?(tYl|GBC^6zvA_YD17nVU3-leQK<3N4X5ey|QwDy&i5MagJScR|6-R2FzB97NJTLFS2*f_A5}79;<6syyS8o`_-A z(!OoTX7mJu$`#`x(#_!9pID|$zH+1@_3(G`XfrhU{kh#X zqzV@l^^O>iN8o(&y}H^y1>}1SwjR9UgXH>nS@-sppw)eh+K>JGKuJ<@bNkOU5UyHz zNc(smNG^dPY{kWZ@%|p4c$0cCJALELr6mjC|H!|odp;Gut@YjRX>bNnTXO8Cn_0R$ zqjT_3KvX69F7(vq7gr)`U3p=4ipCzPsC?`n|IiHcyt;Zqc+wD^<^CPz2hw42w6nZl zMmWwn_4aV2vp2GzKBN_R+MCSbqlnttmjEayzt|O~HA2p<=bR*XOL1TPTj`>Bb&z^0 zwG5NFu;j6#(0(BZ4Q2D|c1GvJ^v_8enIW0@Ovi7<1d)^Q<=KS~b(hkB)&u4DR#T3M ziCMvVUqTjI^kZLIjLjnFX8zx&KybbZ{w%>oBzRo}M~2`V5WQQX4@UG9i2N*(>mu?7 z#2t*-&k}nBVxA@5Sr30B@Z!i6Ol-4Qe3VJP&og}{eey~Hj*D8MU1AB4(af4_BEl3- zICH587dYUo=(`0!im#%%V`i@`eg%TBhRQWTFB@QNZD@Jkt1uY26k)K9%sG(T5_Zj( 
z!WKna=d@mRtp;mHF6_Zy9zpM=>#hCkG4Re)>Np9Bcx=?KvF&N*A@G9fw#b2Z5kO?Q zexYVG1MF04y!|QE8#+9`y~i|J95QZCGc!JH0hJ`f%|_3eW0kby9!6x|K295I$=16H z`iyRM^t1*8`6u5@w#78ViE@drs>3R{PnYDaaw`<7*q6;ly$ryIXN6n;c$K52qDzd2 zSAD>ZbjyGpw*t_!|M+Q0w2X68WASdSy zq0Qc{-5ybw!#!o-&cNKioQ;L_Lh^JoIT;(Y#`9hsw9r+M<@e61HePil~+cJ{}48^!DyzeNCr zBd?45=|r^fLpXk2D;RhR`kmQsNA~KBU+S&*i-Zh}?Ys^RF)&i%*0brMWcLG251a;|F!tuPO;O3z#ab@`_;Ja>>^RK<}kUBSD%VSCs3>rz3DRHyF<2Q{* z-9EYF{(KqAhZntIZES_QOIQ>B9nmbNcen*}eOnXz_M`yockSZh^S%x#XWuG|TbIMZ z<Ho}rP_P(3x#jrxlsNw~a z7qAFCX)op84Ow%)uV4XRAd_r6K}!=1j_CYZNc>#@Rg=niCxz2+@V-X}PX*;*vC#Xs z;>mX~w`Y5_%-52D%e&oDYNy-a#=DP=p`2}atTA!#NMI#K`jU^^{W6e5BA5Bcqk16E zxxB7CkOUeI#!Bt1^~EzLYlG?g*u$vfTfQX<*!-fomqJ&GRkA)59?j^7Eq`w^ zHSfp;jW*JIZa=z<)ypnE@NON0g7zODvV>HEy1Kx^y~EvDH6e7XNNNz^X=RdR{&*67 zf06u9hC2lh^{F0zrd@-VldYkyb~V)TnC*Y!<}0x!<%<7 z2=83#;3i!`$7bCQx?ha|UdQUj$nz`=1rns*v3NwiSYfxW$cp>6R=>JyaRgJ!p6a^& z`Vh#<9dQpkcplvvYX6?QMGkSx#7CrOo`D&eBsab;X?)Wyl41XA4)A8og7(^m8G7YH zU(UCtjDG!%%KGy>5bBtnxUkr#ft7UR3@!zm0J#p0DPiM1+$FRt28NOa%i z3=^ztGYsYMg>}l=P9=8ObZ!2$wW~Z>J=aRv?<)dikDO{|&C)&${x;=!0W4&4 z^Suh6X}@jt`ZdF;ukC{inZJWQ%%EP8d&P@tZrptS8OO%!FEYJ`JI)K|D3iaBqfOdt zs-X9<%G-TWdjJFXt29!6zl_qJpBK+66@c<8-Q`RCMW7nwza8FD0LGmUr0zHN)b$@>n0i=4~HDsiaD!}>F6>9AS%?A$ibP*j;g!KN)$t10dVQ6f7 zMq_tV3Ub^2o8q{02BK=Rcl)stj{;rmt8;)48hFy*ZW2Qe<9`Uyh=`DrNqT`+^=@%U zI{)`CR|+otkbX~TWRW-U<~W+nS(1i2OyB2LkDFn6v$BAryS)Lo%(^4%-uZUn=F9%Q zjNjyPKT2>Lozn-jCHq^p`&)t`Zp#x9_gvB72a5B1^`-HJBLc_FJanM&$BWl?%DaOV zPQ|e?4nH7U^jg}JJq$H384fgFW5oJ>y4!~BnPD4NW)tfX!6#ZL*E=Uokyc=kP_5(z zz_%~s+)Xws@W6xpkiLf5e||8+TSNF12+lXbpC!161h0$W$Pj!3qIXO5!HAv$k)I`U zT}0l1xPuYQM%(KKhdvwvggH2!<)?B6f+BnYvO19R6yIS$c=C+|(KlL?`#GmiH zBg-7JSR_+)vkpMD4%_HDZZD8G%d;fl9*Wj_&)z+Jm3+T2yLaI5tURRdXlmUdd} zexiv3uPAcqwvIXjvz9o8($QG3DCijKonVfRYg@I2zF?qQuYUZaFOYjN-S6eA2k0z*s4+YE1EW8Nj^3`yApO<0o!{17 z(fO1Nikl9u=!VZoo(J7!(06rb)k&W);K&-auX837d>!23PAQ!Oz76!n+*nKpu?BY^ zKQoC1Q-cnhd;>Dkb9cSRL;^ZIqsX<>E)|IR)ok_@;EUzf@aONzx#ce!BP-HrfU^2u 
z-T)pQHOh`|2}iljHyfqb6Tz%q*5=*-_9s*Ar0X?Cq3gnPo9}E~WAW6}L2KMGEOXbs zOBv+6%A%zC;DSb`BlNGRDxfK@euJrIK`c@oq+{b5iH#n~%l|giMz!N{0uOUGLz$FuWkA zvv-H@kxoV^wG#fcC*BFx6jK^n{Ob4WsykX#wB75quX!Yuns-~F{ zKz&W0>rQK<3h7iP|6SxA)^O`<-{}D0!+HIh9EB@*_4B}S(dXpe+OX}F-{%5BR4r>% z!WK7VMbRb7yAg_CuQyOXvNa?3&2{3>Sq^i(jcu=ufpzFC+I@Da?w)39I*`sxo(TFMtip!*$gYX z0_oP>_y6)0!_L{f@$Wa%VcG7!)`1#v#8hka+DgR-7#Id*RB8hNt&GOMxRv6M589#2 zr&wI!q2!}wl>tR|t}pWbb^?Yz_Hyq8V?pHcLQB^B4#?=^9DS3!18NhS|Kc!UfeiSZ z{8vaaNQrjnV8ll+Fwj&u#h@Dk3Xb&~JMH-?Qxi9HAa^IqmSxWyH%ba2cO zFa$634Q?2~KX1YVA3ifjBx=u3dlF*+BmF_?Q|!JlKD2(@r92|k!fr#?$sS6r{&7}@v90v+X% z()ytigyh1w4TU4UfYeKL{|K!btf6n+qWD<{%gud^vG-PmB}4l!30qnM0}ct=UUOdH z=Qbez<(n01bu9XwO49ky!}y;c?0?=G!lyuRz6t&;!9^r^T?9vl;2RLVTcQs}^c0Bv zERpLX@&?2mjM&c-djn#gCEi)nJAc1FUw8tSPgBd^n|FpA6Sp)o=0YLocmJNFccVbd z>nUn3^3O}^i3C3<#z$~Rf!Ud=4LA4}lnP4jb%lrP-!G2F+JXVE8_!JT$?sCK;ZU~! zUAUY!Z~KAU0jf$)Ri1Cu#%lMu{3C4q&}tR$4~g?0K*HOcj>aY%aj%z;xZ0h=ddE)r z7kXaCRddphBEl~~*ACKQ$r>k|dE;Gd(x5pS(RSQga7hOw_V967&O{=84>~~!(-7Dx zCUAzGoZFLk(oD^|Y(nPOA9*5AMZ%1Gcm@u{1R%;5UFozDE1>eYbA)-TKAJ5*@wz?A z7LmRdMkx#1VfJ5Lwj^&|JaPZWr||XTu>Y91?44^GQ0j|dG-SDq({m60^Du5tcKYWB zd!Ner&s*c*QvJ`T;Cr+FALqNLFzFwER<4`v9~V()lvjw%>%zR(Wu>GKdZE~ngiXEy zwhbHiA@^>v>UFc*tG9hoUS7jyPXUiu#PE^xv)EGt`hBkRMkfT5H}eMA^Y{4X9SoOU zGA$)Xt&!D_W0B;&0j$aY$$$R78IY=Cjz4u+1Wf0&@l_W2L6KSNyiW&YVO!KpOCk$9 zPNnKs5wW=d+RKJU%wL*=Oa?K)z{)$%5(i*>kJ=t~bU=>B6#Katp?fw%19hAfa9Bm45~=AX3ni%r9qk zQSv%9zngh3X0^8G3!yxVji^+Fi`Y-%z6E%UdSo};S{rM1$q%J z;H$JodyaBYQB$1-o^^|FKRvO8ZI(~$x_FA=8>aH`O#V-S)kIqxQavfUXl zTCN9#dB#J%Nu=GGLjK&mQ}DXfM_rg?kh*wg-V=cE$!l^4-B45l?f8r4Q|Pur8i(7* zV)%1wR@=TOfp9SO(Jx=GVq8(Q7H7648necQJh7Kl!4AELV@A%HL8SvS7P0m|P$-S# zMY8v49JRI(_QoW!0mw%W-BdwBoi#dH%;xb7y^NsA2 zy>^o~);tXQ%oRL1>R^ldoTfT*UPWWWzul?IVrS40mYKhJkR1O#Ig{MaqEEKYbzHZQ zHp6RaI(`xJVpxnVoAaHC1>$I|zc2+2L9?N?;;RN@YNloKpkLsF`$)sQ5Y$8{K_rRRDuV;=D&zm^nhNYzn|MK*x{^lk4tn|{6F8V z+5bF@gddFX)(}1gg7Z!AX9+GM!RsP8G6dg%=-m>1FrueG)`qDZ12CxD635f( 
zIz`-yKt(71E-sS$_AN#l+MFj1(a^Hg%`#6bjGBI(@u60Mblns&f91TOpfor4JzjDz zEHqkAi%kh#@##ub{b_;NBCn4eVRc1sh4)KYD4s@d-WNVk$ku~le@5NJr+n~Tf8A=B zt%%)DsA#-r-hX_xrk6jJoQD(*cCLD_+pbIgH_n=Hu(l{$uBa=-Y*yxeOuh@ zgW(so@|!&c2*92{OpL+knzURcIoE~Lms>aU2GILfh(dak7lJhv+2s8!xi_V^Ajl$z z=s2p?yx)+2uSeaoH{V&sHe{*y*%H55kS$1-=ED{qPl5)t06Mf4y&q6E1!r$pKlBV# zhI$5B0omhWxPJS5@_ANiylkv}=gYOr$P-t`Fg+vz>yL%0Z%m|dgYJmgIprK|V$$fP zz7yaoyPtv8-;bibc|ts0I_J@4@hx^d>IR6dvBX(&H!nV&p*a#aW)FFv4~v@cP{7&7 zd)n!VbP(9vGKpo0qPZtG8`Rz#fvYoZpO=4HBQ&Z`#p42@pF!XS)h}64f19-Q)o+}* zO!Q`UPy{!$pd5<6Cd>!ime0$#51mG-EA+!pHFd!S1uELVVHd&DVD1s~z@hd;#~u9| zJXi7bWkZG{%0;<_xL=YR8zH9ND?l4 zo&VGGvKM}FyZ+AU&{C*ZUj4mu*#QN|ea=`|^MlmI_bx8%D}zBkHCyHuLeRI5W^`Bf zUxR$FNP!yJWUkTu;B|S80Hk_>ZC8my2E4X(j=@30ANX0E^Eab#LSp4%d-<7)pkl9G zLiNTK9LkjJWG+;UDQu><-dYL+hVbS%OIk2eWEQSV8!3b~3;g`3HX9qhe|YT%49D@U zV|o@Aa33Fb0w@btrzhOy;+Crs(RYd34V_lU{(+^9b z#9&o;f?k|IvtAo_u!rkbn38>@3Ll=26x%?f$)55{402dkfi~hG_Z?id&x|tY z0ig?POLZSSF`xCI+l?-3IFUutF%sAUSN;F2ZVAvwuS5&hT$|KE;MwdRc55C`+xB9^ zcguOWBYi;Q%`t84S!wO(VId8l-;U~wWaq(zSDf(O5*|jv4@P)v2%iGM`6l?Y1Q(Iu zbrBpHf^R_dZizk^(NiGuvqY|o$QuxMFk(MT><#{#XaD!k<}rS;+$%Q2fv|-+(_5`%k%5m3+nmAxGJLd)D1swS~<; z`wCo^ZSfuRje2^6NNkxat;5@A1(VM{UGKNHM_A-U% zG0sTUd^SdgS_ZOlop$-uWdpm*uPI7#`rvyM<@yufY%t}Qj}rm5=CJct%#Tg4IDhvK z`%T}ii2WtHO%EfR7RqB+TY0@M2*Lu;-{48A0 zF4@d=L3xe&6h_haGCdDnl*o55_2g>TiRj5HPysa!2C{8teeW=O

;uxzrPP&dK4t{L)Mhqft2pGvTTyvQ(&pZ`Yr5)W*Os z;gz~nr9X((wrf{)Hh{y^8J7GT=CCv_e3Wk`h@6{O8jfk$2hUa)$iL41D>L{lNA9_g z67q~)Yo+uTK{;v(+xU4+&;jF(Sxbh!c<~G1dDV0Z7ax=FS!*?efiWGQP$e~H7@+;@ zF*YkV*=e!F-&6y;){khBW~$+CB?Ezm&@?z}Gkfsq6HhqH$#H#3p#n3>#@0O<*8^q4 z>FJlq9Oce0q6L31=iv{&bh~}2gTPz4o5$Yp<-^oQkDC0>WN2|H%H94D35Z?gzE(Z% z2VbtXo^NIjK^IljdT6^W0h8PCThOl*met_hKY-4mnIy z52S_ws@?0&Og#B8$2Hn5OOed=`c-z$QPUhQZ-nd>dzTD&4oR9RA0_)DKEKs^+iZ_A zg|DXa9q_<Eh~VGVdKYxncDWwX)ML$AJCecZZ7yyX6$E_KC1 zFrueGGQYIF)hBy_h;_H8^+d$40F zXQm9r_8F7fx|6{QX$)THj+N{5=i$ z_XOD)UX6phDWCPQUx>o1di*bKHY_pALz=#n6V|BE=KDHjkrV(IUVr=g(+|0)hUuCW zUj+-Qcwf{>66|={J9J0K9e*Y{1}Aa)VY>%5mM=dTLU);sEp_3>i0O~_yRxAWz@K;I z?k`z$aG27U{h4wC2(UVNs`;}Y5aTsx{Ip98485i{^0OfCCFw{{cMd3`u{C|g_y>*% zjjvu5>JCPC;&w-VD2hUV?>`x|PYOc^w_2-pFj@mK>fTNMEZ*p-webuyLnqs>a;{7J zfSKdrn;aSJ=16^N_#}B>sprl7^FcJ|${O12gTdUr-h2GzP5{2r_}9sz>EPv)Uz@ou zcyN1p=Dk>3#F3)??+#Wq!~1VPd*gssOOcd3`s-E(^ym~-FqpHKsH0*Y$3<;igO%Vn|Z5C`n*{E6S! zBobxBJr?O3@k5-a(j#M+T>;gymsy4AXuE#bm$mEaW|%re<11IC2%NKIEBolD1x(sG zToSF6z*~!>j=Q5Sp&w&HxsppXfHfn_ROJ*W>>Xm}!_=YFcQNP+gKI5y3SJW#*y z@mJ&Bd4R_G6vw(9}0XA6Gza&0$FRW022r~OU3lLh#4 z{b3<9$N?syIcrxqBGJqOzvyn6o3P;9BUbO%jo9Rf=M;mVCf-ATpONE(3wZv}K0ra4 z1O_BT_ou^1l$i2ZaOJ^8M0d67kgJa?+9+S07n%-4nhU+GWd|YKaki*-MAxR}DT z?uHREn}Xp*oxC{k+f1zDAzeV7wMc(V)B{~R+R5-|T<1P5x4Fsd#}z|uuyAJG%?U`0B&v!_@Jhz7W*m6n;I zP-(|g6>YlsD6O)SKG}N^*-h& z+4~Hv z8R9hjA)$_I?rx6Sn%9VAe9Q!&{~FB){v1oob7 zdnu#mVx}LnE{0;`ZrfAO(jkfc^6Ix3bU{A_1^qhLCGbA%iWuYEcC_j@T@mL)LSm!O zBv|sUqVK{=+5=54h;wFolk*L0)GIQMo{T_D_UqeZmsJv+#ywu$d;#2U4;u|MCW(c8C7E zgH2L+UlU(&M=e^1SjfymbksRI@+BQFTJ8_}_s*UoE$mxlFUIwAju#mBrlQ`5`811u zp{Or-8*Y%%L{{@3#Z%2Ruw$ubE>BYFmCCWe8B0#5?;oPz{;dgXr-p+7U)r*I>lZoTE+wuPE^P-+Jm~p*B10MV z+^*VwERG+J7>IUVPuvel;@fKAr#i?szu5nZu@Ja2JPZ11r-JwlHp0rCjF7{h1FbQF z`e3%~f%vX>thmnl-JM_jtk95-A|SYz20w7tdmi1GhWn4*d{s6ahz1_Mu_F83z|OsI z&v+s`&{!oz#$#KcNXEA+3%^WIU&q;m=_hW;sBiMCFhwl7LGzy5B_tc|uCY-a*H1^m zH8g^PZ9ZU0HK}-;QU+EEb>gsouZJG^hOYMDG{E}mxRb?dF>YI`r|Xb)M@uG`_g);Y 
z0L443_y2Ot!nuFDo4Gs#kX26HFKU)z$b6>I@A~0T82n(5!H3^AXyWYr$@^Rv(Idsf zrg5hsGJpPta^?Zq9G5%k+RIr6zt@$vG1id1OIv3xC>GLy*t?EzozYR~{WRM>h4302 z$tXy5Iw}XBWImnUK2ro;=CA%WmGA@Hu@t95$}_-KK^}+1;81kU{o~OMS}zPoxy5rxwDbEHd?x}k9S$X54D_nn#y{Y3uG^d%ngdz!#j7E0$<6h1L`NP z{;@^m!rJ9myLXg_L6$4eQdvG+MBIzQ^%2jU5DybuL?W59Y@;<9cl2l=s8csEX}5NU zZ%A(2&aheGmDuhvUUHv%Y`aU-;~*n&!8-GYb(00i;_$oB$CeD^?{SGdEsMh!JBK#d z=(W*`#?z29bPF=B)$t!Jf7_w0AjD4YFPR)U|2tT}(Rm65Z(hM?$xz}->aRJHV z@v;j628guFlGo^^JL)iQ2|q98g=+7*$iA|0#3|*8+EOk~xa*8(7+YB^CcNT=@0Rc| z5`HkkTSNF12+lXbpC!161h0$W$Pj!3qIXO5!HAv$k)I`UT}0l1xPuYQM%(KKh zn_d<2bf#kj-YH0#473V{f)B!)+I!t`3!I>6=(h#0&Wnf_%txWq#tLrjHUrS6bfeAU zc?i^b$Du22#Xk?@{Ej>S{9tEK26vOaHL#^gol>A69a%ls-{gGb zIkT`IWd1DNuA7?TAb$)qlW_O#7D7>;F@ z4=y+shy3(;HuJN1<#FTd{1rdoaO4ip`HWK_T-NB{9n8_my>D|r`?X@^-`;@qN7H41 z$`VYu7tuWFcLxq`R)E%~5oXO-blt#4P)u^kwB_4bOzTS@#dmct7}~FSRDjHHQk3kf zCznY4Av@+rifyM{l-kG90_TeO6Q(Ll^@7tr2ICsQX5|YfV zejQtI?p!tFGQoK&3oIQ)%$VoCns2l@C$?crx*K&@A1nxE3l2$!fH6)3lRKY+flKXY z^EdV+pc7Or`};>WbeHh@eZxQyYHPnQTiGR!cYIWO^pS-ZuEZBdQ0T489ku#=IDNel zzrY>)I1+B+olSH<4%1d*wk=s1wsB;y^XYc|CtM{^%kKVBRY@D*J7W3r8&5f|Fy@ib zP%DAkOG?{RRLsCv(-p3Ph8n!`+o*{me?`-5?CXHv7@`nr{0H0;0Q78(?c&>^lphh7|~N8^0P#)i^v-gcQ9fW&MrEIjw{23dB$D{3U zIeysDTmfOWjXCpwT`|ukiCp6db4+nDmiEu9D^Od@@0MVICk~Lie9z-uJktDG;=eFN z_I1rixgOHc25lDRT{B-ou^;=neM6)u=n`h0_%rd-97j<1wFH*D}u5e>VUH?y)IbwDX-f0jPtmHLWZkd_dK@Sb&OV`{r`A64}UJdu#dN>BxIB*+1c6ia1gTh z-g}e1XSVE3vQkQZg`{wDBs54Rl~j_Z87hU+bL)BT=WpP3-RFBApPO^n<}r=y*+CF9VBTo;il#P)8@GOUhs9ok zjMLRWmrGP|EoDkLO^JT%+(!{Fe-^0vYO%C;T@12ue$y-2D}!HmSE_UfwBw+^wlxa|?Ko9J ze!Y~j0ekA3d}Bd{m>+s~Fs-)2ain|)blai@%3g~~n?g7yOMhOnvK8*o$$FJ?h?o~D zkL0|!U4+VNs$7|!InYvN02zkxp|Xw>cNydx;PK1H`=f*t!Ko$e7ceJ*CWkFQ*BCcK zs!Au>47mba7Ggkk_I(2uZ)xHm?{P<+l1U8o@l9|=c&hGv<$3IB{6pw8 zTMpJg-@%KEl2H&_y)WgFSl}9!ed}_#6Kd%l)1eCrf(NNN`+~nVqOiUTE8sM?Qc{{B{2OQD4`*0Rl0Bp|p*abrE@NV15_&_Hj&!Rr4 z7r93X=+``bkxHM7V zr%~eEZfXO)Z6k_?+{5s=F&m3S$^uu3t=_PJZ1whg(PWo<14JL|3ABB7Tm0KO=ok8l!OKB|zgnxfY7YBK;sAwTm@rRGNpPUn->r 
zl_ydl-P3ff4%$p?P|_5(?pcH>6X@Rj6(}P(j-jmcAP&Xc{4-s@uK*RdUGfqSN=58H zrd#!WBZ=?1?U}=C=FsWL_5j6?vCz@ER7~u1B3{m@FtF~7$1}YV{OkS+P;xib^I1J! zxG?wMg^P!IA>($&%EO`zIPj>e!~PIOEaItg^QEy7o?tlk&nwQ}ZNGohcRS*DED0v*($TVe{(j>sx9Z%%t@Gh zx8_W~OaPMREC^-{PA2xA4_Xy2ao`wEs~5lOPGh$3jKQ>u1#m=Quc!1KaljIIRD9Q6 zGY}`3Fq^e#faaW-nAHsbD(yKa8`t?&6=vB*op^lQ5w}lhy1nY9!`Yu!+2(tB0h`xO zOLij9HAJsutSFNW_V3@&T2ErPW13c8TMOJvqdF_bz1OR?Jm8az7)1rvQQw z9Tn|dPbWmvddOROoEsgvvE+Y_$j3!h&OYX?{MpTT^0c{Tts(Y)s}krI8wPg-qlXhT z%y?H=>sH=lbnwraR}_mqJy2l!_T(p@9ym@-F>2l33@=@E&>oPkgxf;TKC8}a!~DKy z-+xK+2g4q&Uae-0(D=*oBbR0`Xxr?~w9n%^tjm=Fk$>kBt=gf!yU6WV%@bQ>V|z)s5^ z<8m~dl{J=;mJS9C^4*M|Ho&r<)0N(yd3c~JgK{K35k~BU2|Hqn!E+r}>+8SsP-W55 zhwcBJM9yv%vc5Z1p-6R2cwkH&J|}ej{6}%ZOTA~KJx8z>Qws7uxU6XlxwoWLjBRy= z@*IT=x?v`8tXsake!B?_z2j8PMf{!WmbBtR17r}#_;0mi%+k0*h^kmRTp4zne81h0 zN^m#p&*yx2q=UEZ(jDB*;*P6trnxGw`J#j#`m)+9Cot91yIGYvhDd;?gyq6TNl>`+ zBg^e9w)WW2KejVojl4ejZ$T8^^%mO&}&^w-%BMTs9#Th zoV||~9qz}~liEV4COUHV>Mm83wS2ygi_Q~Vl6xX_KrtME`{A1&MojN(D1QIdN${&Y zi8e_r4Ma5j*z_sjLET49ZlM97b!wo*T)PN#x_)>@a1mi2)wbN?94GJ}Mdq*bTxY?r z4)s?A-vCBFKVBNkl!zuvQvdbA?(K;G*HdWaFgnjSYmI(y=w%DgokYnq8-b6F4bkHc z^?!FT89x`v&HXI<_L6M_FM?1^lqaY9Nn+2fQ0bLzrh+#3WzKB-SPA13M#Thi5Jq

340#H1X8e2H!@*9PP)Ebq=>yn_=9#EdL`O`C=bCfjgCgVTWMr^ zDCZ2^#d1CFnL!FFFB2%SAUx}RdmkM>LF)!03B{v!V>1>y_`oxx_yUyR*^t@y&K_;k z=4OzMD+bL8_@x!PpHIUF=?_hi1_49+|*U`@Vt9B{kwdC2{@ z19_a3RR4P=1v#r02Yk2j295F&t=Ga_(Sst^H?@?ZNa!#f_f_pMG_C*oTA!m0D81tD z#K2394BvXwJUQ=)&1aJOPv15N0dD&=Plc(Yoa>L%Q^lh3o|>H^_1qyK(#1KL)sVP@ zZ3uACJ11jLgWBn-m3S~oUG%_d#RcF0VDi92hu}fKDf+LD4WjSq_j(7Mj-u)Q#RH5_AVFDs7P&eR6-3Rz2c=^elXHoL;4g*&Ns=QCAo+suZ!f!kbDEOcT4uc$ese3pCxl$WZrUZP{#q>#Q8Qob;i^eO)G-d$pZ1yK;UDk+ znD<)1fp_P_o=RvE+&T?!rm{H5Q+~4Srlc~uedX3|iaElI)2e$SCD9F-Zg+fo=z9uS zZfwtPKkS5f+49!LFWUpQ6YosZJP04&O0L|FHgE97Ap3}17U7dk>PR>^6^}w%eeKFL zHIR|2;9JkH#o$S*$D`9G8R*W4+@|jqez(xu&wJwx>Wi+dja5rPJSH5Q-Wmw1)*_{e zyYU^XDM=kqJ2bxbk3S1bZtq=)@s9(oR-YVmYBPX-neQsWk%9Z7nM4(9i$J|=vEF9y z*3|skzdqPlySMg#Z8xBv16BKD#G?qI_ToewWc6{9zN z+XC_~<|50}EPtgJ-4S>1i+^*L#`vcGksa4@vR+{M`w#U_v# zG05?Su`u-ByHj{~7{D<2$J^zXS&-=l-N9=&h#s7jm;rZP01B()d^r<&iufs6eQn$I zB=jk%a#Y2`8s5-zJ|u*6a8_X&OUcnoa6Vx_kUU%m9$2}`^uA3%c~{c*)-)!Aq)u0*ufAH=umB~Ju6UcJw^*{3>3Fx-3*kcJ6AH;L?(M?14RHSsm+jz}34-M}u&FdgM zCe&el8!ge9;Bj-x+2pKJ{OqTg4P%=H>a4Gb%+Ra@_gvqJ|E;UTy<4YuKemho+Jz=f z>)e-t^;tFnmqkL&BsyFC{%tYfb&k?)`CS3Qs_=u}1F10Gs+uq4q#Nqq_WJxm4P3ttC!|#BZ>a{Lo|N4NEas(< zW@G>NbC)wPuWqB0?Q1u*@U=n49A%@&Dh+t>#`=e% zJAok8azn03Cmb}pT0Z$AorLaK3m!KJs)Ms$RV8L6)i@8c^Lr$?z#reX$EK^S~IqaBUoI>TolZ*7F zxiI<4VNRwe{`l6F`BUZ3JfIef0XEX}fCKU(>tAJquo6Fe7|-zmxYBS@tkY!xPRh(? 
zhOZ96PG+&$(sLs)b*gbF{rF3`C?0s)Pk9`UIqSwao_z|9dA!_v4yVA9MAiJYosCFL zP+-${i}QZyC=(t=Siaz!Kjn1=6;W4hdTVfFQ2Hq0Q@}N!?;K@+P>rSqJvRBXc%Z+3 zlZ%Mg*DL(Tbqml3o(&a(BZJR>cXA5z$^<~e`EIP5x z{OsW_{sDo(RIqDjsk+sK706i}{C5X?Ui9N2-rz%k48Ha} z0x7}g8y8_MR4@%?ewY{lsbdWI_KLehkkogNu^J} zxN@-h70Z501t&O}B3nI^>j|xQJj)kf^T6Ua(cPF(2c)!~PyLk0N#^{VYCm!y0-QUz zJxs*$0@e=ZYTSFx4pLaYxnsH*2)`VSjSuIHz{>u5%46?bk+f{Ocv4dupw%6(_&AsX zz#&G*Z<$%(Opl38V08ifLAymbG~NulzSDow!fTIDydKEg%AkYu)C^)?6P|bPRG-8k z+=exG8qD^pwPV{~JJkF3oy9I#dpd@_4qE$+8YHQgL3itd|FV9^f%ykl)Shf5dbCry zUR{$VFeH1IS&sZEkY_Ja^Nr}I{h}-Ub={;G-`{66*m*JoJQKQK=C`E*ikdB4GJKwd zfK;Kh(=%J3Yr0FZxi zHgABLLG;tU)QK>;w~OJ3Xeo4GUVc3&837hXR_&v2I)XVF^O^7@L$IjYzCP1^7UrcM z>NjmVjZ2QvOE;us;K;$SBMlq{_&fW@4YRUKu*m0}#{fTu?hbdqIGh@Q(zoLqw0H3LJA=Jq~RV<K58-usen58G6;n?bRk;%JCcAiO_zFVlcyZZ91SrWQRQPx8E z!EhN<_@=i8(oCte+ewF@d;Whe5u9)6*v+xYpM{fi?lOGm+=1su+D%>;+`&l0b6VU5 zJlme|Gu74!xEeT^ZT4>K-rxGy2a`T#`>&^PnzMnT=Eq6Ed)(ltJBu$eq)GdiH<17M z?cW`2!s__H{p`&UIx7XCQ^3D%Z=Lq_9OS!a%fZb#J2yxDY|p`F>?&IOIwiIcTQ}JU zXX>Z}!6)9lDGnLv=DMAb3E{^P0R6l5SPWoTD=&bmX|UkRWFA+!F=pAvlC3-M1ZO&| zggGhAk*o8-g3{XvQ;W=8v9(al135_wfZ_hKagMI#Kbk zRd(sH`A9erI48N#q@)bmYI`i&8WKU-eaC0jvXP)iSM_C^i!pX)y5aQgK@R@TxZ?|- zhb1)RH=FlnZ3Q zxuB7pMdcho`TO7tDUN11Fe7>=y}%LFip=uQ5S;hV*)2@X%5{)_G-2{DUpDS_^&e_o zEXOx;{dkmaCLjYiDast41AZo4JWDry30^HsB)ARL1drww)ep@Gyh7hMo;v1+b{%hd z_?+NcQdKgXb=|HHu8L)73o|4@9}!G()T|E6M5%T8>xV=AfNZvK*=ShQnHGIxe=|N( z=W_MGv>Uj`GKF@qsRdV_mow-%ScApObeOg$pTSqpwjC=+o@itII*aO+Gq9P@vCo>d z9`AUQvdd8^4q1O-?26^N3wUGr0&wYcX! 
zNAXv)b8v`LQ^57TA3PdYKd?b?CfHm`BWxsG0DY&y?k+(l^qBd3p|NHl-X(TPZ$_aA z3zY4fr{pStQPCqqJ`Dj_^se4%1$lSupK{E-MZq02mu1r5-m8Gy&G!fG(r4>x$$ed3vh@D-OnqN`u|>W(sxUG7)d`E z>8&As3MA*7@UkVd$9C)T15zDU4%5ByJl;Y)&J%bd*jTsfeUd1P7iZ+5uxYsSDcSp-+s z);?U0)567E)Du2BmiP;w)s0k39sEZP zhQHOB*k5zyBjLmc!UPu)XXStRwj3ab4h@g{Xiyk{yoZJBn|y;^e>)NVb`Y&(b|B&Y29gJZvSIIEf6SXzd8s~8)fDhxs z|K79Zrsw|6*=6;x0D7|;_&Q2B-h)pEP<&ATu_0j(TAWXHd~&r%e=_MCKG^W%Gh9zT zewZ_aLzL9oWrUwWw>=;Yo>#@0bX8~9Zn=W?J#zM{jdtLj{V!GDq9C9@f7aO!YhuJQ zG;;As5GP-nGyc?)Po8_=YFdK$?ow8^EMwM z|IXZ%TX+=4;L5GH-Ar-!))K1juhGy}?w8wpXHnc68`AkqS`nId==%1_^}!MX{TC6} z2W!4;Uk;GJ37kkUZsjY}qE(77gc(@lj#E(j{EDLZ>r}X_QFEv~I1&b@3Pqhe7zzcJf3(S6aDxiB zn7)0?wgb|=hTW^Q0ho>@uxU-i4mx&eZ__*E1pah2SUtKBj=y=`{V_)FS0 z@o8sZdLaKS^<7o4s&G7DMK=o0D6Sox*D{4gwc>u%vlMXXXFg4=rwK@V>i$?mT^W7d zA9=0&EDv&?KC-4P;evll3Aa_N1R z)tf#Gyda)NTh($370m3mwUzWmq*t8u-I5+g(ho*@Ye=60$@wPvvm_UhLTUXf;KgqL4chI2k8s>T3dOcjOG7xe*A2 zXW{cJ>iTHtu}Mhtk{)so=K!DXaRWUG887PrTR_oY_V-q}KcEZygHP=D1U7Yj<|xBbv(g4|c{->a9C2XPEh)kV)00DatJd!i4fn`?<)wIK`=~o$Hk7oz6yb&LjaZm+eUJzIJxpUO zVtlB>>hR8fx+h8x=`O__5K+cem6whVpJ0G7W)Gvz4er2kEUaZQOD4#f>YS>IfD!n? 
zHTAG(h83+$Pk5geR6t_+XXx%Za{waIT8`u=t-NzR)W)zbmultdLg7gK=J6HNhqrFTc%-8wgP3k zLf6@Zyue=3uPp3-8TjaIi-iPN4!r*;>&We#&W(>E#YHcF*DyiXAAP4gz@rLO2O32Zjyr>Mxf9~Vfk~# z$8NTIOLGDe$f*k?`!<+6uD4Pkx!ZFyyOQI4zKa)lm#IR(c;4y zgwOP`$F7HCIppw$)Dj+4CgyrIS4?`vN#8B$VI=)vq_>9jDUh6Rl0Qpw5lLPb$&n%X24wG+?1Pa# z1u{QN=DNtd0eJ@__p{{QfPBxAbCwf_`@4$v!CjVsH*?7tuINAaJ?x$cTHoAXUu6qI z3fpw|jU|PH`p(oh29(*r*@xEe9j_IJ~IY!;eGiU|Yx2eX(GhRHuo; zx&yMkB&T~xBLeKMe_7k|C;^CBw|ieNbHL~HwpVk75_`dn%h#`jdSd%uALp*g$>Ac0 zJeJ`rgqQvD$z?rVV?;gc*zcmg$)ANG?p5Ig7ZK-P)Z4R%fex*$ z|DZTPcUj2iAapbN1#I+b1LZ#yG6h zJWZ>K2L9*q*`7Lr52y=HBm_|b@OaWy&zYaNSJ5BY8=QO)@SglRcy3^=TWt6FJwk^Q zakH#Y*<7b6uJ_Kou6a2WP(7{|eiadnR3FdZtrs*zyC;5qZ0$Y{p4r#)uWk%0MQ2eS zPSILcTHA8NaXHop0%@A_`Nk#Pe1ZzcZ;h&>Cq(}Ap|2@Wvb)k#tfUAAPbw^adcy=X zb9QKQq|yQ1Yn=?$Ukea}{j-yaU3JL7_|7&xyIN!$tG1L~l8;t={T&4UWPvTC{HgJw zIk-l(Jhnh97}3{hY27$e0~Qm)@bBFPcwS^z>bMYbHw?dXmzU_>#yFIGbt#O&xh6{r zo!JEER+j5xS9v+~=k(pn@9PKK=q=0(w|N4-<>?#thvUIRSG@w6*Qv<%8uwIqYP|=;!2CI0w57=hMYJ6}Gjo;TE!f|A zHBe!KiURIV`*FFWO2tU`i-O!>@oli9lHz{+>U)Lxs;nvS_P<(aq38k@^;uG^hUD?a zP|It1TW8SR)OD1)Dju*DuFP|!cT{v~6dGX7*%#3K0I5~g1crzp#FZD`7wPCGSU;^ML0Y1SmJycNE4bwk5yDU*=-b#VrOHHt{ z6tm0E+W=@EKDhAXUOose2%r@S%>(NjoKYHof{|}vug??jb4Zr0X8QGf8yaP_pZ=oI zfqqgPwpf(ChWO}nl){Fx&}mjV>F=vKcxaWC)~U1xENOAJtLo$-xx>@uVQ)|4ncJLy z48K(Y(ko8-Zb=U#=?5ddHKb30>P@z9aTW zhNox)Q;)#mrakvnDU@ND^aaTzm78!{Y-ww;j2hxgv47&d-5V^dReubd3_;nm;_qe=;nLN^89q{*1XgLa`O|9rPfEcFkwSP4Ji>8_SRS3jUS8~)E* z6F2#%ttqJ%j9EQSxZ@(#|PZSDjj)D?9u4W&(vbB63CjJb(3#^ zYgPXBZn5Og0hNvjPjq5$*cBHwA7nZiQ&$XBi2Pk`H<9bY_QEThc>^2;5{sjMoJC4@ z%mvmwqR4XXmP4bq8PfgP`|mw_A&KhWoZZ>;cbEqU!GIF)n-!H#K=s*z5Feu&_^u$R z_`O6LY-`v@KS@P{#@~9Ez38BUJ(*9y)Y@St^`UDJJ3iX8B+D zfIB*?J-F){HwzGV-EYhP%oJq4wj$oa0_c18Sm;?1MKmJu6^`lDASzRzZSU2ZPz8(X z%n!XwNZ!VD-&L)1XpZUUJSST@06VTdYkL!b+hxCXW1CQ9R_Qc!=lErC^fzUE?WIU8 z7UgIi{yYVB2FCjZ>oz3M$%{o#MA!{%Ipr|`Ctt#t3S zQo+YX{=~6(Xeg0idnjF|pGn8BL3_*b4xK4IC@P}JH>aY8^P;xY8bvXpahJ)TUn|7H><<4)nsx?op1!?)-8K+9 zQ~%bdc;JiG^iyR%M_3{1Y+L#R_0~v3W!g$E$qcaT6~s>!S;0crxa@`dw)k(CYf$ov 
z3gmR$`Ri1PE)ehfGtQQB6m*{+6FzX%5^+EI9{FU|2zomRZl9o>nS=CINpuG(VO>QPG6QvAU(u@mI)f*M%tfL}k182yY?5 zJ07^hEs%uEXJ1PSWhcXIpLui+7K2onB2DqY2YcQzq!{5k1xc?s>ANL8jHDlo^wy9* z1(Nek@@GjdBFXC_IWi>Qfb89peK4}8K;~!3To;))An#!0ewN%DkndS?&dOTenY`h9 z2mWW<6D!rJ1O-BVw9j}r!vgt;Y0ht!aB+`u+Dfz;$b53(?bCq(@SS7cl8=oO4d=BL zaanah0l%VijEnsE^pRq*Pvz$L7DK#;JBKSg#XS*qGK=7y$5GMhWCoy`@QJYw$r**X zz3C4HsI!oc*oVI-&z^*%Z_U>N{2bs12NUhACRJFY8+n)ij1P8ZEh~m5M$ptwz1v*C z7|$>7`!m@bn0aL9#xtiPI4Jhp)>SeOX4W(O zw&%FhopF%i5OrD_=$T*ByJeG)$FqGeBs%gy?eW-7v-BJ=7*y~$duJ6^`m8e8ZmkXH z#2Z^Kv*rNt;GO?`x4!-CUYj1qWwTwT5#N%4zQ)QwZ%xd~?qUutQ!u=o`1_!cF&gDR zvB{r>dBL}Q2`(Z`7~-BKcwP9i^Zv=0C}lt;>H$TV?U7@*+h*?;O2&E~BKlx(h1s5( z=qYqJtsH-!@E;>OrWe*~tQ3H_`;Kkq4WLxx^PR*U4EMGEyiDw8;ggqCnIaeYk(-En zs?8BARAb#>u(wqk9Q^RTazH%}=V^(W8I@AvqmvwCC*^kF`$2MS;P{g%dB#ZW|f=xRbyiX$-EsqdWXSb*hEbPeWS^21TyN4pT=`x6Z? zWX%p#&Xn&=D3 z+cC*mG+P0A6?1{7UP;(``=JqPmr}^hQ$W2xFb<}(mEPiJ_b0f;<8R-u3BabGBj}Zx zD}4P@*w@n{5-##cwY|%XgVMX>p6{oVfSgR0YuJ1=58Vs(VVOaSdIfh1NI3$zNfJGw$88qcJ?4i@n&=Ys}?Fc zxZl+Oiw2}^m!t4cv%pvK#`dsN@IWsEO{YJUk-)!>YrVguA2h^S>Yq-I8F=786*Z~Tc~Q9S zxGSHrK>}_JelA)c6OHL3V;)+>72rpGV`}j|`QW%tcG|6IB^r682n;*pkEng z=QkcKnsCPM+zo6W)V$D^R?VR=1CB^mq1j7#I2kPs^-vzIRSHJP_g;(Y z@>XRyS6;w8|DOS5LG2F3uat>?&p_0;vkBf}^~&Jm$r#8GafVib-VaS=4rDrR{nGt( zNU~_|a3-Ai#!`E`&KFVBUcIOT6rh!#=>{up8r+`>tfNeb@AiVc@TOP1sdDO{@3!E- zMXLpE3y|xz@XrsH`0SU?kwPZ0-^iK!wXPi4f1Upj!TE-hSz4R?S!~>!vB^awyuv?^ zZ&%^~xr0-)F)}i!FKi>4;2S_BTD#f1#oZ-)H~V1N+?? 
zy0Ekr|7P9*e{b#Iyo2FHXAjphbt=$y$#eDYJ`FUIpDKR9MiHro>YHxP+153i*Rp|u zkiF+?%!>gEsCMMCS5-L^^qc`>#@AU9_osVbBIoqMM3v|3oh$YrYPfMr9HXY9kG(6kWzpT7lHYjYf&*4MVBlr{ljKd0VHO_bxAA^N|pF9Kk{c+)2b z9S`E(cz6HVxHGtmDv_rl$QsgbzbkD|?BAbnNZt}MJcT)DyNjO3x*!joUtdaQP0^@- zLn_;J9A@_ql9w{6h3w`Xqy4o3kg=Q9)OR=vwQjMd+*dCJG(*_Dzqxrpy~uZ6+ZPD$ zbl6Y5ksT4x@=W3E1xZ)nCjxax`VOIpZ5pK`cg)a0XoBnBwo-VbT=4nbH|daSCP3jw zZy~1r<)f;omx&K4F+RHA=Y+SleLjgx0&wM(-j}mset0ZpOm0=z5w|#8H1Xrn!9QAP zv@?8_k;s(T02{#tZ5m&_{z%dmBd~Y7W~LdMi{f=F6Y+%;Wso^xQW~%S=`6lJVv4Go z%0AB${5>h9V^xdEYUsud-NQfrih-shCEeT39>;GfF_7cTmAc|0za{omVn zR_MaDEGFL562Sc$`u?RX5N`z1y$<$13B!aAEBR@?Q`)wBVKQw+7gfAgXRq6>0&ZQm zi7%Pq1v}ikruL60z-oyvYwmpx*mHFt^|_r9+^ce4E?9sUhx`nAxnL*%vL25+&LvGB*@*%Dc=luzNH{EfK_0%_R!#ZP_?!oO(sC|jnBApyr;ZeA?A z>Vni~#ew{JLp%kZZFBC(zyr$knOm1r(UpsBS9eG{K-qf?d(61gu=vNt!KT45q)|D! zH7L~^l3sDrcT0L0Nk16rts#913IE6WCi$}@7m?(3ksKM4Z$S2L$vzm_Qy}xRWUhcaJB486MGBXz=dsDyw~TDYtk&BrVxb;|7JAMtGyP3N2Rp*q zq6eJNC)UscHE}nTz=VTp4miS+>6|@}r%&QWzm{&}aUcA`{7P?YgEwr$LK9X=ZkXNs zUQ+O-tyrM%^@WA#t(fnHr=d^$ni7@mA#v~ABQW`_WBeT(Q8+aDEd0YAKK#o~ZeRUA z2i#MmL4@nqyD6t<->1EgMhlB(@-?f+v6;fPr1Qay*ia)Uc7QDr&G;K?ekc>e=EpwY zb#757`ovEg+AhYS zt`K|!r55_Q&E754IkRyiBupCFgebq!+ocA)wO6Z%{48dpj@it0!JD~sn|T9xk>jjm zDZ?KHl$>P!@#8#!7<_#u&r1ZD%T_(xe9so;-S{_We{4Bz#SM+|%=Tku@xDL14}>Os zY)KJAwKv9!LrgV6euL44G{!0;O%HG>i4^nB??=OBX_?h8#)b0p$Z-ZpO@tgm-t zZHuZR0WPHYev8m4Iwnwmv133xOav_o3WW!=nWD8VX`Nj8kGg5=Bzjd8-z()AdMfUX z@_`~p4+h=!UQt@xqfggJ_eg19bhM%Un=1Su-Xo&)TN&2e&aIQD=LU?A~?^#@EB090_0n3DaBD}1DvrTq} zzzagc_YZH4MMkAL-(*zWp|Hw0b!auQ=X_Un=fg)wRG~24MDbrCl^CVgGp~<8KJwxXP!~Zk>ge z(>rc&3S6qQ>_7?3h>v$*wuk(BK=HlhjXFoJ6yrJ}MA1)L5Oc*PF>J<`Z zglhRO4?fHV@Pm^1?5`_AaM3VIH*O1YH_o^oB*FbnNzJM|b1dEvi5E7xO?!#r@KptY zm?c_xRGD3VTfQq|oLP_46*j<4qQ6u=O={v(yrYbBR|AlcF7uwESW`UrcKT!UOGCtc zj&p_Km@0T9vb&|*Lk!xtylZ)r&V!Hs5Oj6=HK{Zn9AIsz8O2$ffJ z4lqEY>dT|Q0E-#CvGexZ3UBRZ=uf_3fhhGn=yyq)gO~GZdhYt{NOq$yH|&ryJ`^Uv zzBe4e3xfm0-JZO744;?v728nC)YBL>Q}3 zang57dKgJR80oDceF`M!o8-@uTtt%BMRH_Fz5&_0CHr7xPl3$OlDRH2Z$RF`$o(w2 
zHz41$U@Z3unU+MBK z9_f2kTek^`z(c2|etX93z}^&r+<~0QXzc2)tDkb#yVv6LX!g<@Lbd19^n(+%+F#$E#J*t7wmtzJr#YWM^`k$rZv}P!2X^2 zqH;VsNH>|yd~)zbT7nub1ls67%LYgwuQ7MbP#oQVAXr&0 zZ2=BeahmP?sg56b2~xjP*M}8@M<2e^k;V02)h^5%8^hEDC)c%gG2kg=;95hxZ^xDd ztF3rM0gb^YtHy_7K(j&9@30_>-Fuwo;N=i()17&CHc*&#(4f3eJ<}AJ2aAl^6b-2^}vbv|ip{hU<8#KaW}Qz&hJ-`>C%?aP6v|M^2r{~I!Y8W z{$+f!P9+OB9*^bDjszkL29C`0**tI84>ALJk6)>X`+C2Y@9(P|oA?=dC zphW$!w7g7nP04!CyGw%R0ciK2ViuK+9oVkHa{C*PD(>W6?{AO$sZ>)qZTxMB2`}|b zF|rdm&a%9Lqt*$gpoqqALCS~?TwtN8a9ZYq@^P-kC#>l)ZpEw zB_ED_@?)BLJNGkUP53k_^LQa}(xL+=0;U z!K3q^0)d0wMw!)GGW7o>D!3m8f^qF!rkjJ2!0Ddmf>clxB)#IK@0Rp1l76tP|MS+6 zJ_VBVP4Z_+E+Wb6A~`Z7-+=7hl6^3;r$FXs$y^tiHz4m|V{n&FDtG(2jjhehm5?JEHDMTo1PqpIiOwnx%!J`pU=QIYGqoKRliHKak)1#}y4kNXRBDl~Gn4j=e*&vp3m$XYZ`+A}b*w z6_t~t2uUg-X(&l0l_ZqW_x65#?(aX)531w3uj}=CK4U0|HW5rRe$;FQt6bCG8LI)n zp;;GRo!|imo}2vY-D8V-;!2=9>13oBIoFQAE1M-K@Uebiav&_%fcMr)zp zr9k;DlicGp_F%}@_tU|YAW(Ra>%{fJFfhj_N7qRi3*KIvoQtalD4=^j)KyOuFI}oz z`z`B$=cXCRU1o53G$z#Pd4)g4?-RmZr;Hb z1#>gBHN}9Yj9m#=h!iq)*sIPhtBMTKL(b}k9YFYn=-kIeKbXAd<{#ENbu2^4Xj4G1 z2{lz0n1*FE(ak@$hhja=kc6|N`1@gk4>f9Z)q?nWAhprURNLsVt^35CEW#6%V-zzQ zwO&FM3(zU@oFE;&c0)pV>+V6^{EBi-M@Ms8eO6Zphf;*J?2FD z)0k?Ef4LQRWR}8YH{QNeY{cK!#wm4yAtL9x_^Wa~-5U(mm2c@~%fi*-`;L`eDT05% z9-H^fMo60{`{t2F7Z8R!NJftzM*}D zhjPtGDUit=?7Eq6%k zOP&Vx=iE$={S3HK07_-Ae;1&Pam` zWxrB)204k>n#IM8VIOO1m(U#1pYf8N@Vw6lcUWKMn&wx)+J){)qC6BxqG-+InWh5T z`YK~9fAVp_R5RiHHDbLp{mqx)OI235;yGpC4nsY#*I6qh^O^&C(Q!5HU5*Ku)+&2< z(GjNCixsu4df6qKLMyP<2;0c_60eTlv56Mxu(p6^pZ&5wW{=#2%!V zMYg*&#%%6nq0=bsgOgMsR%Tf|WEABN21|HLz1_;t+Zm6I*030yyc}F%^(z8sR7sva z*;$BayVlabI7H)#XCZPqhf_f!`{{}Ax8qT2yDB@h55V}`)3y?`Xh3?!N#8B$VI=)v zq_>9jDUh6Rl0Qpw5lLPb$&n%X24wG+?1Pa#1u{QN=DNtd0eJ@__p{{QfShN^clNH3 z1RQrokYZ2$4e0<$D3LYQ_1ZfCun+C(IO-dQyr!md-aqmN8b`mFsd=S>)WS$zK?QeQ zJ~xWovjd=uUC8dhUIED9^|9uVq8X0*s(w;q#R-I{KA6lnpo4neIZ)s3bVK^|@0*mh z{lQT-f^5I!2>8F$8oKN-L}q$lwRx70BlUbkCa*pk6q|YQachn#$m}{bS#;VL+%fkN z$|wv4d*2RT8!GVxG95k(y>Vv%e}-68g^djUYv;z4W#ow*bD0~znI;0Ub3>=;paQ<% 
z_}jIhBNSOYA@&e^lR;y2;l24-11x&&+TN`b-e}~AlPrfp2w0HT@DpPuyzWz}f%GNb zh{3jh(<=_S`{q6pzFYiloGV(~;y4J4JM^>loE3LEM>qHA1ASlH>bh(r zP%pT3|K$xvJVK?2{|2D%?8(wEm=Cmt_YMV`o1mGc z<>Y-AB++46m;c_`nIpQMCx|{*fIa2OuI?k4nj_VD*hUW=8VF34Ipl{LS%b&kDVU?d zT}M80(wQJno{oeQTPIaMX5U-U&nqeSjD zc#-}!9P?caWtX*4D9TIojLcSZ|9f8d)A#p7+L66M~KJF1EdY>QtC zy(q*SYbld#0=f8hKSh5>Loqy2bCv7I+2i2JwgUZ|`oZ9>S8T+Gia_*%-@dupT^ec7 zkMRCu_Cx$fKApetEEv^zP&=p@oJO8>9a~J-<%t~RnYnitwUAz}(QJ)6(M#Cd&_A80 z3buU;`+31z3keMisn5J6_8Dr=d7^jOfPk_WK6W8)Ks#IP>n;?9Gx>U#t^dd(wmT>H zQk&V}4;2!h4fTlcy3@m*dzDmxjj2zJ3cCbqpgvB;)Jo)|c|;%3UG>8+{o@(VhQz|6 z?5|n*>(ao$v-%e2lnDwFpq#52F$bCZWE^}LtgzW1^P2C=K9K**nJ(>X#Q$Ta-z}jj z#)!u5UBpgjO<*aw^Fmjk0nqk;6<|nwC$*Kavz=A41g%sTeL7rB!41q#pC@mR=A@ro zz|9HhoA=8zqHTG|IA3-PePaRYD(KJ)=S)EZO@DiP4idSNZN>l0HKu6-)VY7 z0UO&Px+S@M*l&HatG7KCDD4?`7Q3kV|6Xy@cT0L0Nk16rts#91B>b@bXf!Q1#MwdkP49ep zI>T=R>ULMv!#sjN-3sz8q$ALH{qA7hUv^kLl2WE*^eA@Seeg5t(1EV>-hUE^pOZ%5 zL(Qzq{-~XCwCXmO1(F^Sm5KQs1tJsI?Tu3$z=&H>$3X)oBpR>YeL(v-D26$AekPlM ziMts|aVE~7G$B2EAjceO9b7l7iS$DTy!^du_gxWvSYCy(vm4NHJ!`c*brgLL-_vcs z_ZA$|nW#5NNJs2Hdt5_Pe8B4%wu!r^Fyx+YdcZ(%4aDA5&TY{8gPndcEdQEs!$*0> z2TdAcLDoHM^Kgf7p#AhTEevgdD^}*q=T0XBC+RyeFSe=RZll6YuQ=Yho82Ig$ot6q z9^KaaI{<|hZdvet?E-WM88^K(Sn%)8O`ihp)oS3?G)_WAi@OwNPsO5UzPKNWs$Qrm z?8_{{>%xu8shb=beCzs^t5kzo$hkhEX0NInQvPsjMqj}j={|mtwb@fxu@ETP%+InM zyYgSI3vJ_gv1$?vibrKfW=jpwNDsfp@GD}zqOKF%QcC1%|8o8}&#tUZ8gIU{>8acb zmm@DhdTNWk)~C*atjdkXmwSB?XVb%F55jLxef~y;L%$>XvrFoZprJC}K38?FN=^mF zd(t|Tj30ncy12R9Z=S)E&6xziUqAvjW^sHr)X( zZfNNJtu(=aGw2SV#gMvo48j>|W?vdD(HD*5Z#D+P(Kq27oAS6=pz&ni5}$4yUeXp0 z{<#!@5?L)jQeMsiLdqOppGqdidBQT5yGPi6VkO z2NyXf?ljzE1G@I}oj6aM36dTE$^F?Ejq)=0`1mD7LHFoOPvrxPAf?AIZ|mJQFkn)y zit20-U_Q(c_Z>)sc-C_lD}H;Uh^Vp83)JDj+C%Vg1d%4=Qy8IjXXE@x4?b{^)KUyzx| z2mrJh@j8Y@W}tVqJmMb9WmvVMA#+018Hh^EUaMhFh5M*vIhf@l@waPFx}M%QMiMMX z+c)^b!1mi(uhT+Qf%adfF=_Q$_|eLrdii}R9%WccHuH1?_uDC{Vhw}9p%}U5^(0^9 z8Tjf=zDPbCKK#i!HZKpad@|#wbZ~-Q19y6L)y$B|_=djjcPn(6&1HD$vmes$mN}M9 
zcz#}$z2KUGhWOUwm!se94Dh1Gv!gH0orY#^85V9SDewjN+f1F?mGH@n{0I)z0*^~& z_bOk#2|s^fJe9AR2k$X36#u?j4hG#HR1a2^Kth0dME}?x4> zSu)o}<_*X@7`dM%_XgxVOTM$inSaL@F89D=j-n%8{EGM>&3pg*{%7z}A%AMYZBbAy zckv5mc0y-O?*`tZ@J2PNVI^N45nQi6)_tzQeyC=`e3psG;|*`wqyKa~6v}z_tKO_p zgM$x=9(H61&NDnQzgp=8vv;dLkxIM*byRJ4SKPe*edPy2F!YksrbP0aHa2 zh#!rVvh`Ff;85qyE`rHohn zk^QSMEo%2G*gBx-fUHO&P|BV;DZQu%j#(`qAiU!6zE2|+;k(5&FVisLVZ=W1y^@ue zL%_|~s@lueo`7>Yanq*&>u-vkCOF@?@$(Tqf*VKz5B>6h zVl8vYo*r_rD$$hJ+0z*Js}(X1=GcPfFsiSYhhl+Uh2sg!kXW!@xpDpv3IbvZm%G0w z5Pa=$$>szKdAy`wud%yc4Nre6HNTktyR&M_f2un~9^Uw_70PwF0nUvjNwJC%zqeRU zcZN1yf!o+>hHe=&!l-bc9;5X0_%EPjiS;9RaZY(xqKZAR(s=dF8`;H}{j7_}?KtAQ zapvcXmP&ot$X9$|lA{8bTYAk9D=SdJan)$3xeC&*`<~e!5{Me>VwSBX?7^%M?NgAQ zkAwOf=Wj0uVmGGl{bNT0;GKXr6BqS!sAWoajF#3L6tO%ybY&_QGmVB%v`UrY42DsB z&Xga=Ww#yqBpeGy&hO=1y%vUa@>3ZsButQ*d)u3m`w2MPbfOOKJPZGD-_VS>eHQ9+ z2lm;8C&8Kg!cU?+sYJhWc}-2H8ZNm#j#E=?fm@D@?)XvH1k-hnba6@~!*P%KHHtMv*QpjI5umv^!8k)LH7EnJ`@N0-nHV%kZiGJ-G3ya+%tC%8Ekl^FT z+oBCZ0S(m|E1tuyQ0#HBt?H|5cxU9l5YES!;D>|hI>wKJ(bR@~-Fd4JFdCJ-bbmw| z{f%XQniP@_f4G+3VW-Z5){iee{O(_ZdHJ+d)+va-a{Oq#u%R?GJwWmJ^))F-^K)_F z@it>TVX@IT!|#K|TKVarZX!k_cP^gO%))KEmfxOUI*a2!{i?^3$d1E!g$)0!E5+hF9G;gS)Ez@y3b7Mzj}*Xn4fUQ5UPo|U zjHR3Tn;+mC3`W8uz!2dmXNi902$Qc}g*F=!zz)~@5|kGZp01#jjkDw73(2%Hal3tQgb$*B_!Fy^FL>oK)!P)24Ch@d-tF&F2CP!_Zp|Z> zz=BOhIBu&B{Kn(E=@rL;t1g?qTRi*6R9oRQH!^7Bpmh7;j*_$KHoY~FS9fL8r+{Vr z4{dV3@ftnvi_r`kmF92WOc1)!8)x#ew0y zk$!q2bws}PoLQl^3!c9FJnjuKw~XX@yo;~CjBB2O83sOYeA9t2V1*Zhe9G}N`>hgi zau&D24OfB>nWS;^j3zHKyW^*T4n|^YUSSaXrV4)6&ENe|hB!U4-fF(}q8JIeysJ8< zO56eZehr8|PlvIuivDTR^;Ii1yq*EOzXnHoE`m z&CW>UVz^sSd$#bD8*J!f-BS81885cIT{Ejl#}A`uXlcyLu!-nX%7FK^IO}*z$Jmxi z9Aajc$Q_l1C%%16^L0tbC##t{YM)r35AN%qqyA+;-GcUk1IYzgf1`+H3kT7!whocd zep?RX+%&zt$G!%{IPR1 zUEPmwnNa(c&Elog5qMZ^v~yp>X%xPeS;byu4kG=lS{FtNvCypWcV(_)Vn; z$5kGjetYH=x=-nJy<5&4bf%l@Q8C>-E?E zS#iSWOz6bZ$Bdw}opEkUi4knOyRKFnXpUR@?l1IagyO{4nS8~moTaB;SDS-I9GUvZp}iXUSX_nKvNsVB~(5+#8VdEcwpnam@d#8SR7tzDz;a|0UtP z*Nf79Rt_jH++tjh-2=6!6B1deFqE2dB|K~|^ysT&FoR3eE6JMx 
z&wLD+R1=AXRoX@~uLOf|z9u=a~#upJ2hvFHZeoNa2KIw>!4D z=m9uFD@%83iXLVaGkPB|A^H--DtnE}Y;iolzny?n2J(DO^XC0kGn~Vj#O?py9dBSt zQQPymNVVa2K}eM$cIEC;x#>yl%L?7kG9~1o9#8&=&|BUx$=V0eM@m<6NM^TQiP<(pn{$lS=C5nrzh zsJtc1%e1^e7xLgH{9sVi&tZx1*1*w)|9lEis@~PmTv`IKT1V7#^r-=<&%K*mL~O>S zy2<$5;(B+sm;#sO#wVLS1^tVW|K(>Za>f74 zb=Fl4kdogTqmj zkAj~I!@TiVH@BZohUEo7*LsW+a{A6nzPNn=Ca&{6sB*cEbENLP=16P7;k-YWTTj*C z(*65JM8*oS?I^q1?(P~m;3UE$<>7>k7_K_}j>&|KCl(|_zF&Zv{9?utj<#su#QRxl zp?oMSJ6AQZ9tR_HatNlR2VlQDI*<{I+Y|ZgV{HpZpB%-dB^wqSDG@k4 zpj~XYbs*a6-4u=qXRUV-qp^XE6)-p^L1d8AU{%(iNn4R5EGGCY4c)aT@{r{pzV#vOx{iGM%azGEdY7P*Qv(oWBY;H)ca z6F*(-5l7ook%G%wK%_)h|2e^_Gq$@VGFf4d?E{`uHEw7@8jifvLm#41VK~(@>PcS^ zD;V;{zcLO6@$f~Pc$?!dcMqSUojDB6Y%2Cs$2ueXaX+C~)Xt#SapV0Z2})$%c}FFc z*yHXBj2E>p_Jpn{E=e9@GQ+4ZVag;ki!WO(2oj()34SFVV$w%YXFyxe>4(o0FS(b&&kg#jw1oHG;=9=+ZQlg&>mrMqeR42P zTy>I3Y`+9OJlr&dY4U(}g#H6N?nE3BR}>V-SqmRXUOT3p=LXngBwCzyCgZc!^^asI zF2E%3v*Y^|0+FUi&AXJzDBSul*Kk}X34A$pIzHsA1DbN-d7?}ifDg$F2;3DujjiKa z=JCx~NP5Ld-!183B>iBdw}$j7keqLlKTC2ENnRJpks_Lmr~0RlXrFTocN7PJ z7)@W^zg5=IW=BQ*5xroj4(1J?{qn&BbKIVv`Vuhzv!W$so@A&Un-nmc$qB4}_{dEVuy*Sl2`_ClDF?_^;F5X=V-t!o^Op?S~aqaJqo? zuYLhPH1E1(NI$3#^FJrcr)=Yc8!D_SNwi^T$IH40*DotUKe@-+lWjimiDl?=M@1Bh z-I)~3Hxr7DS1;Dk`o_a_@0Um$M80zIc|Sjz(LH54hW&0z926Mit9;v2Ef zre0~Bf!gaZHjh9SYSux4*K`Yr{XAScbL+Z%$wj>d(1r;E#C0JA3Z)$yxaFc z3RM2`olUqz0V$Zt+Vf-JK?@+S0(x~qR*}N_2kk? 
zZp`1j9asu@L)}c@owp>JV2w_u(8ZNwXeP;OseeQnGJkUa{E zbm4wnqM_KDyE`0($!x8@(I0|%$G=kRWD}mR{_kGrww?v!t9nPib6*5)`{j?iyeR_> zG0rSAExBOUcf0#H3peDJ_<)-$EC9Z?e=8)iyB>Z2QYLsOEEG()RLTEIiGhLYzf-B% zT2Q7UYr5MV9Yi&la#Wcn0(u6zTRh#{jBc|k@%aQ<1BwLIVHYbe$o)>deIxi1x+wbj z8_i(@c=)8_*c!DeW?~c?$Z=JJN#e#ap6SsU+7A^IbvSDf_Sk{(9V z4@P=xNS^}9`6l_ZBo~q7b&(tyl5arvZpl6v*;63%vt+J|%o~t*FmgXj?hVL!mV9S@ zc&PmILvBL_qmg5MVosRi8qcnWW11Lct{n5S(S?i*+QMa%$I;U{KF8CM65yA)Wz=e` z4`BM06jx8&tqTkTOtY8_LDk4B-EIE<#Jy^b8CnuN{vb6W`%p zIn+G!3LjqHmN)#tgIZWATdi$mz>S;RH*;NZR!}C9$QxkmcJIwQ7&Hp__yWmBf-6Nn zcE9ouBaMX^XHNG$D91B?^7$o0aIzkW`XAX3=e5@^%hUeur1y~EOMi~F%;RRdD)ccokvV(0zN(??(Ms@W6~I1vr&=df;aW=BJ8v4_bq#_ zJh-t}`1-}9c(ki$ujSxV1(4;5zBpwhf<=$#w})QrOvnuhlAQH{XpvO99$rujPg=0hCLl)z)Q)W%(SJGfauN&MeowuFsVqgi!>ioU>wcq)L60+mwD`U4FG3B^pZbbnUf z?Q<@%kCZ&Y_2X(Ju)k#UNi3rjT@|~iSj**%=PJyZxEDgf=TI?t&&LYq99|ORrGt2D zo2mS{VP_B?+LNXI#}U=+`nccV?-|rJ+4uC0i9hb?US9bgq6JB>IO)43J&dFujP%x! zJ_VBVP4Z_+E+Wb6A~`Z7-+=7hl6^3;r$FXs$y^tiHz4m|^fA>kDmudePy>ZM9v&*dX0Iq=+G#0YCO^3+O2$Ia+TQt9@@eZQ%cXhI5pRpsbD^ts#5)K2l!u76YeDR=W zR0eQ6#sh}D{M)**(cqY7w(q{fypUZ!w7@mR5h>HZaSo!30e@>MeLsj$L!r<2c^+B> zp^Epp-RHWG1DQUd>Q>^r{KJZi1NE6C)VM0C7C{$`I)AF)?%1h=$3D`oI-vwKs~z}( z;n-p)|Chi0JTz0CE0G*MW=sYE^nd)8XH5rc@AU8LE)@eq2bjk1`~U7tPtd!wD#`{A z3rc3Bj(UM%R_WIYZnj`*nq_D6UuK|PbG2a859Vi7@}IZH^S$pLql{!AtYg5_;GPb? 
z2P!<&t}sA{7HT)Sh;V9#f0NhM>G0D=iQvd|PBEn3ylM|nV&HoVPSs4G$Z4Z&_!p{UxZRQPd6oXWk4n+{+?7J)QG%FTi`3*G6Dh#Gy zjIxkLef_LqgkLtUeV}aMGj^wm^-ta8i#nE zuylv+jzqPsItm}#!mvq^`ke|lZNxrSZD*%n43(39Bp%7~hxXhmhvlbS0lz>%0oTq* zP}gdlblo}*y*KC0zfyb#)Q_)DQKv| zN7HcU&F@=Z`bFS@cY^^ocQRn$yY9xDEWV(EVQJ#yf8RJ&KTDM#`t+H3_6O)>2+)s`CVC~zWv=wDEE7RntK zQV5*Z!masEQM9*{K$dZK?|bV&y#1P8z@wa9So3lBFw1re#GZV0n1{_5nKX+UYdwiT zKLbw3X7fkjhNt{i%TgNHGsx3RbX^OOUUAZQOL`bdKN#t)A$%{O5w!XkDn&> zvVb$M41-On3252f&g$Q%fYts`H4Cz3g2cJh_1HjrShN4qXkj81W*52{sq7|3?1S&V z$v$L(jICD|Wse;K8a)X%Mteh`mfY3I*uHJJr+WH|hp!D1c<0MNVQGY{rJJ5;dP{-+ zjqU$=7?%&+_|FfP-d-b&*W%C`4O4;Pt7yPrYM-_-pi1xyz$Skdw*<6oauJE&+y7(; zUKdP$?CdeWstkPQ_dbgDUREfiym})o3Lw{GbVXM4%qYI>(+JU1z^QBFoB3HxH)Xw< z>%znC=gk##y}_;UcO$&h5y)O#UySR5B66(XJ*Kwrm%@94shgYgZ0qQrW1H`+$?fXK zKT%QeW!Y8PimGv?Rtw|#R%tC1lf%baanTlV_ElE;&Pszn@nvr%Lbij3-6x)P-`ECb zri=Mw?ekF!g{6Z(JhgK=brh zEUH^qo^-0u1&YgWZwqJp;v{3!ds>~@Xl_}Z{a<)G(BE6X{*2!i_eo{AN(#gysi40- zdFDnK^vQqQ@j44HNCPc3_Y|RO~HZ( zwRSBj90NwrKNf#v$OGkF`y-guV$rPP!*`J}vC!&ug5Iid0o>QO^3K529rs=dxt}l+ z1`OWjcS-!P2O`}zCnCM#yW+I$?p zemI=)SG+hD0bBs*tMd8UE2`^JQ{9i9Cg=bB?`{ zLNB%T&nBF*1af+XpMwKTFlTJz9)4NE%d5zku}?S%wTjWROo$t!8#xy}*j_|~02-D0 zI2r}KW!Y|r{<#xSJbkL*-)RGUsN+zZK+rBwr?DX;&S#HyZQsAQ&65>KZrwLUEnp93 zl)F67$2+2)YQLoN53Z<>&*_OZw=Iz?^q1v&oCc0a=kYezmjSU`w~B0=DnUcu-`WA5 za=^`&e&bt92{36^JhzwA4;qgfU9k@>K}P-ulH(dHz?|9j`^gjuaQpa_``-#;ehte! 
zyIxrgs;hNP{7xprlcG--CXEQ+cxh{_yw*8T{Vi?2W-=E3Su0+M-$~@v`YZ-d-PM3o z&ly@Yt_I+Dm!v;*rnsRQ@qKcRs%j|DCRycMnm2O4@9H%9)r#QYw^1}`+T(3S4`ajb z6aL1tf_GKjO#tZ?Cw;f1hmrJy`TfsZL;4g*&Ns=QCAo+suZ!f!kbDEOcT4uc$ese3 zpCxl$WZrPT33EQJO}zjR~Fho6u>a?PANSxJHTABl>Kx%2gq+psQnqe)8^gd>R`mia^WK0ri8epLjUvKepkNBvFTyFmR0$tkKYA} z?S3n+K_s5WJM&_W3pk+9?58$4-|)KWKwgeLwQv)-rO`2NJ)pD7ITFh{PI# zv(L}JyZhxRkk}t*@9eLQnD#ku_QBw)vi@dI0gs^n^0V;Yu5-T)G|qrss`qVY3y^+1*px+H4mj$0(f;gKMFDwU@r!%TfqjqO9$k&D1Phjuqh;Kc z!2Z;(!aE!V;BL~}QH|1MRGJ+ecVHnBKIx(V^<%*Ul-_-L?&e|X*9w8k=9(t*yW_ysu63?c+GntG;sJN|J=)N@_G{88e+WwW z(`w2Z;)k}c>Ix-V#sTFid@klp2)345cV5Yqf!#k`-sTg1-CdFnC^#|#{k@|fA9XPU zY-hyD`WIbbMn-V=t_U}rUO@qc#Rx9W!uIYum%EP~_tW^fV5pS~x7as=!nJ2nM%^`#@^3t~40oizXvXNur_HKXh zp<3xJy_5qioK$UVrZmAr!{NUQ`pgi;t1gWX`<+3C|BmbIKO{l&&NcbJ4RP>;DD)o{ z4aM}yPH?|@DC#Y%qg9CW2Y*fV!O9LdK<#lW*6W}T6q{jY{Z8qFZ9lMnuX?EotvJrx zzqOP?Ueo1KcRx9zez%@p$6R^fB{G%wj9C{6qy+LmIK+Xkfa*MfEBo-<0gmrJb~=#W z;~4czHD{P?6ZcV|Dh7J)RXa7UkOBGq*=Kh#X2TQn74P>oIl_fcHfBVw0W2M_9j9{0 zf%azMdd#EVP(1AwAp>;-J#GJ_;FogPOPST<|IY*7+4dsxzKzFb13zu@ z9u>N<+$sM(OV(?<4!GfB)XY-r0y3YT_|L<*K$ZBPAB>ft3~YL9Qr~9`9(I*QV;gZ( ziKaxq!@O6y>yA9I-}-Qqi-`Snk4_W3F6euC_yNI@Q7XSJXGic2I(hkoOV8>Xpcb}- zn(HTZ(9l1b@kkFXa6zcgmRr? 
zVzzc7a$)ZlbFRC9&eV-uKzKw2SEvnF`m-PdSNmUm9<*-T|EId-kZM+T8xJ~!+5--F6QFmv24wQM$x!m?=PGzR6uSe zusv4F0u(->@cP*w0()*!#%UyG;7WX1q41RpZWjOb%CO7Z+wMU6un?)y~369V@l9fsN~k2-F`4-kHN|D zhy?gdcv)`$$wb(|RQ=%UaXqNLS}YVs+`ruYdxv|>`OqZk7)GlhXX;u?wNo{;vHO6JI?l;A_F(R__3ku z7!D7fUQ6d|2|$Crk@oZZeSnhu53`312N9KuBKPA??-5d2H8{oi@-zDJoI|m*|18P?$fGiHmafdY8o%>ja@$< z5y&L`iG%fKO{3q!0Pkv#TR?~@PS|+vcXT2Yv+q~)`O)JFnKwc{)jW*@cM9uEsd)(w zb_aW|q^d8LSxEjEpBs%Gcqsi8KSsmtT90=%aD)KTD^B`uNe?6G2P3^Tq)&n5e3SfH zl8Z?4x=4-;$u}T-w`3oT>?x4>Su)o}<_*X@7`dM%_XgxVOTM$t95K&o|FlC(N$JX_ zi}F~W(+Q`kxM5FsPor5yM`&_TQbuh?8s|Ox#-n^o8r$y>F})|t0@;EyuUwVVgpC#f zmjxM2Veg+Ep-H+YAT7Sp6#1L@E}$#A8(EbN4F+G+2CWz(mN%bwrJdJ-EN=$IDJ6>G z%U@gcEEMcf#7HtQS}=h9S6Bxh1?IyDiwdp`0e|$i^5k=GKUqA%5YP7eZW^?iPF6VsR$46&!Tw`Xh5ROY_`s*bX4;iTm{#UT>#pRrj+7>!g3!_s zF#ZD_-1OawsPz2jVT@pt{m&03HJIH@!7T|M^}9b9i%>-VOL0~|bFEQ*!22MAKMSV? zjW@Z7@X#sReuCH4$?-Pm0Kt)gGgtVU8ZR;;s{`#@Ceo!qmP56@Dv^&taeCK?o&wx! zk2mwP&{eT~GuMT~Q)e@Xya5zAL?qB^)xp^(b{`#@bWy-#^L+|jQsB;@($^wuW1w8L zzPkC&QZ1P)pSqEO=SP~yM<}<$=W4r7S$*P%e%{|LPVW`LG~bt7R}-bM zyf4pAh`48U{fhOp4@-ng^v(BYg$r?1rc{8)HNtz5Ilu|>{Rob>O3p)r41B*b(K}r? 
z3=N6Yys)sefjcE?2H(4eV!*t=-t=K2qBM37R5k{@U?~y{ps(Lk9EZ zm&EgMZe63~)fe%2M?*Q|jWj>F_`Q)*Ni7lFxL@a?K=jEqxLSRV$R%QC$35uu(I^~e zzp%IJffDY0WO~Lm$s2Kr^AB7%bwsau@=e;rPk>ksI>QdDRGeI@^InQ08%~#pkNlxb zgqbn4SNA(bzzIGvM$^~;ywUq~r~c9zJjS)kxBiy!PdZ7yVv07$*Yt-%ZrruU!Rx+9 z*|WAHrkUisCr&y+aPWB9VQoLmNnu-ZE&UjJfA(@nRZ2L#vf_^FEe~KVs_Vb0KA%F< z0rN~7>JdE*xFh#dlQVs$yH znu8sI|8|_%ySf!J_=pA3mHT3M@!q0yZZ1%n^VQZC0cG6&jyb{M-IP*D)wUBsa8jvH zf_0^+^&tNIiNaKATpfS$aFbjbvc}hygq8+#U9dz_+>+HxXG}j*nZLu8`12sm8;{&F zpg%CTCv)BbUt&F=mP{LoU#mw%@7!Jm*jY;^15Wy3egpa+D@~o$XajtBy%&hi_ksJ1as}?6mCcE*ol!=eF|A zBxX`zwnsm&3^Y>U9yN)+iT9NF=@Y#pvHN!6s~6wcH3bXca~G{y=HIDefp=nUcRriq zV@x~r?&}@HV(XXJSoFiuqmPAW_^$E8zkj71n&pVT6=PS<>T)<@7P^08?yEL@UHLV8 za=;AB2EQ&oS8I<{q&xzEh6DW4=wAQ7KJO7`D zF@YjqD~akLpfJz>#g``tJV60&`>P2=Y@Krm&Nn{h^kkDi3qv$LHo1rdui=v$!Rx{T zE}kX~L)_rdg5?-h`*tuQZThCuAP%tHwc6~1!Pfjs%0y2A%Q?N?%+KPMTlSl|E_~iG zBq3I54}!U~sh3S;5%0q=#|ERn3UAy#)eCgFAWp7t|IM@g!IWd0@2vjhXsV+}6zp+w zqPgis3D=x>>xQGL;EG%R_^<9uKiza@vyGrn(vldj=GH#eji1uOJg#_Sjnm_Y-H4pC!7%y#;I)S;p**5Cu zGlBc6N2#ZCBvw;7G1=%BgxC&!`LGmb3?y0wKfJyik0)FYf9;Sjgrx_{cRRo+!s{I& zKGdE?_&yI*`CSPFKXu<8ERK`MZ*}yCbvVNy)%}oZXF7Y>%%<}8kzO2lN&WSk^q4mq z+>*q9#MlD8?RWViPxP6Xc6|MKQ#cTM=!H=q7>>ppi~|C@Dg$t#h{i9YPsibVuSVgX zT5~wG$EbYTTpjY!_a#_G$l+fH8zy3gkHFiDD&}9xbW#7rH?v9>4Xhsg{+}G>QOMq& z99G`vfa1juECf8a#QnwncjrwFan^?$R|=0fA#jM9ZR>;{=Ht?H3%+ECDoREP6}io>IC5bp5L=LvOAC%a5#l$^9-5tt;(LwN$eqXl<8Z;HpWXVPn&nG!Mg-?h8mtWn^}cSb+qDu0;Lw9lmeR1PkjARMx@ zoa9Pv^L-%iL`Bd5+|*W|WCM+Z+rvE=Tze_4{jY2H8WN*R}Mm(=3-SM-XQ zrt51bIHI<=g91wip=l=`^2T*=rAyKOk%+CixXs<;5*&PDvS1g`*Z6Q zxQZUzJCWTJJph^cFAK*V{~`5Y@cDOicNbpGlofsTLK~IbHH#9wa|7vZi1scY;(@)3 z_tRQ`3&OAN!UI#Z%-B?CcWq6OHu~@OpH+>Y_GtX>cpUiwC7cJMw!Pv-s*9FezFW$2 zF7XsTW>E9hI*jiD7ofY5)jsVj1M~!=&XRlz&0?kBTS%O5EOJ2Ig~Xq2{@V<;xQH(IvC`6LK4;gcTAz=xHLFk|5caNyOm0<{Ik^u!v=FeiPTfT#yi%x^0RP4 zDp;t8k_QcDPJSMllLSFV3?j>%+(3-sOtVSgw&Mpn3An08d;2Tx0gjYJn+wYacVDV!` z>w;^hh|G<3{YRnL1|(e@_3qmKsj#dy=V#c4vBFaLsLBL-**?36tmdF+^5cM=v?i%H 
zA>3>JoCyzB#9dOz_$^gHVQh1DFC#VzR=UtS5dc0sH`b3KeZR%DoEjZZ0C@L75y{DP z34EiHp)Iuh7_R-?7~xBQ6DSV7O(&zVL^Mv*sUIkIqaG`7(@#Aj(Dlf#B$ND?&8a5e zov$jyy`7V2i(8)2l4y9rP~+9L=B%&Qpg&hU zXXT|a<`_A2TCmau7;NgdVTP%ygsJ>s0r$j;Q#$a z%o(FktMaWiPXF%}Cw#Yrhmr7u5#AcYr$BJN3H~g>MI?A#1V@J88xXx)q7O#&6o~vR zk?SJz2E-kV*v}Gs17e;f-q|nr+G8859zp3d^2N^r$zbrt;)M(yGRQXe&`MHhwYm4h zy8~W7nel-`QOjo^p289@P~{aNI%t|GAh(N3A08-t^QBH(09QWCky>+iM{IoDQEWVC zpe1L6`e_OZ+@XDD>0YWA;tp@@dQ!{`(~^6?+n&;f=1J0j)tekpM3hJ-)1T|u^;Y%A z{QqnqCpEowf6*0m&5UZt^DAyxzI~6+I>);v33HzLDt~U^`m^-GdC`B(e~M2UnlkLh z5BjC{xem@q_5QxLQ;&uVtTzZeuMwg_8NFU^WZD~2oh`m9sry8+G)LhsZUH(t$MP?~ zk@RwZ+*Er*d(0du-1y`(L2C?a`Q7CDN!;<4#b-@hUU9)|oXEE11Y*&m`LE?NjGduNOBk^aODMaaLkEWbKgAGRomM#jR z4jSW$pkgy{)4q9@#5chGM-FfGZsESU>sx&=%xv7^OzJ7XR~mi7M|=bT(`*TBZR14- zv2}xIzjA;G)ps^Z+(PKV`yTqO{cQir@9n*Tq(sWyt$9|5{#oYMJL{JT*38dxLfLsk z?O?Y%aK!5rzXNv$tW~MqAxFBG7=1pX#(F3a4lM9GdtFYzBj?k|Ol1vG>G)#NbWuFy z{7kL2s_ldS49oC8w7&r!moR_(r4b7GR(5>0F|xvYl2?+WN^?}7MkL~UisIsQcX?s-&F3-gizgxXTrHHcqlarnrR0pH zyZ^l$p&cwF1QN%yr4$!S1!Q&O)T{o6*HW1RJb~`6pQPA7v+nz%W>TX^0 zH(Ss~2gJvZ(HAR&pYpww=d+yH*ZY9{{5?*5M*Fy!&WQmjHi`+K=b|Uz*-!1+uP^-Ax)67nI+l}>_~<~k=2wLo5hq{A%=M)+iMuA6a;9hzhO&)C$y3)YVP7R+gM z0m-v_Xnt(mM9EpK?Ab>j< zx8uxRH;zHVD^B=s2@fOT2P3>SginFsd=vaxf{RG-x(JR8!8ag!w?rR|=qV8SSt8d( z&2}cX_Qhy`qygq5E)*<<}l*#kQYD zKO@&6SxAV>pS}<9*)Owytuk-mTivDf=AVgp@pR|#<2!L+oZ>=aj&&lIKiRW<gg(=`QF z39`$kyA*NvBdxl&IWrjkGA>F|!U9`GzZB<4^+imj1s@IE{lM*0=XE-iLeVM{!xYKa z2kY%FmytYDc&Q?8%Ws0+{FJu1`!G<_B%Rzq0m#4O5p&sR2zH!0Sx~BBjgAB*ZE>9- zwJh}(Zv?Nluy6I?v1)0=RzDMJG(3$QB;CtI*tx5wUR^-JTHP+y3r0Y0R|iW~zYV&1 z+LwLnE{H6;tncTf&H zQYAp0xZ;0z?pDI#JsUe){zzjspD%|tBlS?FKs?j6>&GxR2W5i9RT*4*jAh*6m<5V8 zuxz?eu8n;?!?pP%qwvf}cZGXejwt6Sm!~L28Vm`c>}oC60Tf?1vkKQ#k#X#yk)yNH z*l%ag%F-_;*!1MpOhvgcl-(txIz4Ivsw})stv1w9t8kR6p^z!|PW_Elh9 zqHk?|(i-nDGNQN?+6#vVWmnV>IRfF~rBL$jdK~wAhho*_Fr+KLUURk89qjlh!Y=)< z1;0J}z_#Ux6>!cTc_9SOBWfp!h?n(A_+r~>BL!H1y>3$94w1e!rZX0<48#OAv z$wgDdq(9TJ?A3|Ca>=keeILL(Wkskz29CoW|K9HF(y;_GDr<}f_gJ945(nf)PYD1C 
zZOIoOdfVVfwu@u&UmCGd=t!1|-b<_)nPhF8*9R|-?Av~4ublmPQ^l7Pzl{5Pns0qb zin7-fURFE-V?_sim-;W_{wtn(6`kR*DcIfTbWa^rymo4)&1V;Er}k*x^Xv*ZJvDCs zRm~1tUA+5s{`5IKcW2Lgy=n{8-FD#I;V)NUqs~4Lv6I@^hWwRh-2+#2{I1ifGr}#< z>Uq?+ISM1JC#GX@;F}c+Jl`4KozewwTsUoZJJ$s$C!~ z@hQyPzGnSlH$Ci|-(}!!ssSRS9$D|2p+>4xVr$&LEpQ2$F@fUrF05-RBPn?CHDeTD_w3|6XyzcT0E} z2|pO&ts#601m~OJ&k|fjg4acGWC*?i(Yq!3U_?)W$j=hFE+TJ0+`)+bEU`Br=2_yM zwWA&%Gi+v*Hh!^R<>X^JY43n*NsFs{q*V%EyWYrLhsyT)a&=1|pop{eH9M;}u;q;6 z7prH9c;%|4*N4_@z%Tx>y;>|0=UtQ7Y(?QXh1x)9>fLP;_x0XGo}ozW%3G-ycONTBu@9s5&_3hpkHIk&? zKZt5yFRLl~tRBk|of`n!)|d0wghLU7(5wK-iwb`}wdN=J`Y?K8xaE<;F$Xxd{3bB# z$MK(xHnLznC;WX=nGpy%SN5Q4(H*u8{KaN+03M7?W~Gf+4BhBZF~M| z)cGYQ<0+whP}K){wG_{8&9k-rF(7$^efRp-FJ?9Nck-BZxuu>iE`uYWGmWkcdt zCI%#2Plt)G$|{*(i^9ho z!y9yrYIIaOsMe!D+l5*md`vvv$xD3=Mv?`tk{T`WjP*6@`n^NVEY!|#XnHOp;lt7@ z59zc(qU9*n+sa^QMSF@;xLE`;?qtk9dgcJ0P}Z?4OXWv{KjXxAJ+J{YqMsr?e)Yh& z76;{oHx=RL`bRaZ5PodoaSSYUae~1cJ$!M)7XaZECw#a6?_vBuKN#VyA$$r1=bPZq z5?n-r*F|t-2)+T)yCwQyL{EXp&l0&VB5y$4!HE4Vu{R**S>m0YJ#cNc`1C&M-(1Dy zR$3I&4kt&M>PLUUg}D7Tm>a2KXMY0p_KK7VX&+HTm zw)`@He3E_%pqd5xeBi%5AFya-QqgIuWt`;@kTIn6rbslPueKRu7mf2?`1%Lzp+cL^BQT+ zR2L5vcK{AKf0ws%meI5H3tRag;lYWut-G{^_0aac6FhA<}y)9f9TAuqd#;KfwOLV^Hng*DI_wkYAM%g8BsX!lE9KVS81U*~yOC*5S z85Ve}P!b;fC(;mhw*6Q&T-InFaa0n_){yb1nj z2_6{TA66G{!y(^}9$8FO0!7UOC)#@kBqc5{Ii!l@Qa8H2rld=T=fTwp*;J9CaGP^DGm5P;&}~_>teLUr`1v!N-K|4}5ES zHy3n9VOkb+Ye$RhjBbZ48YA_^B2C!fg2qc_-d;RVY(hPv{2E#$b#3nr^ujH-=UE2_ zx$SpWdqZ?&@5gJ9H9O#DvBxdg(fGK%Uj8mL4aFHWwr$Y)nxk^J5P&g3E;?T4Jdk;H z`46e4MCdbWp?Glo3QX?bA(nC48GOvJ<$$91q20l>2`LT>mZ*U)6 z_)|&I;a~-wi&lnIwLHP59e1r{&l}kBX;8=iRygYNqbU{f2nIF|Wfw(+JRqCAO0an5 zCCDQBC1p7HC>~tbxftDQi@M3k_HCTF25d54_lR}m!g9%29*SNQ=)`m5I+N>p+^!-q z#?f{JeNc;G7mvPx9y1GmN-OM#?rrz=$a=kC_MJN~zxgX*m9wJOdL0JfhKr!-^HXl% zMsxhYX1x>1EAqcyal&^?co+#k7~!oUdW1Y&9NoXoLg3qk80A4$^V)!s76=t}8=}@pNMAfc;3%RN8z}#_xE3a+C z;pqd+H&*?UQ2&Ivl~HmU^sSoKyDsexKR28mS3GQq7*@DDH$KO~hdU^0YRxa8$!aE9 zsuzh!w61D{MY0GleATRHdgly{8xA|ilE>oqq?5j<+TBV06w&Je<{3z}_5I$9@>w|Y 
zrj4z(vMK%>eC9AqpgrbGuTWY!;SQ!24m(J(#h|=;j);8gFkIjGL3@JPgXFR4=MC_3 zgvO7a>~SK`1QS99`-*&WP~Y*-W$oWnL9hMm`N&*9Y{QDT{L_%`(%~&HDx}c)Ye(|+ z;eX~uN?g9ypvrbZvL-nc`7#KL47CJ+j=e!!JZLQR&3%hg3ely8Eq)%9RuJ9dI^j=a z|2^D&+685d+A03R#;A2*#{D`BLT6raWyM<#iQ&q-BxT4iz1o##?*;}}e;1lb`9S5Ovz96+^KtWsTYO;xfgq9n z{Vaul9sFD+!Vynqi5i;>F}I{K@b7-?ts-FzTWd9-d4?+Hyr;wbgLLQDmXUD6*DaCW z2lq(*h3mkGBxB@;|7JA3*1p)zHeo*D%cE zcxRpA2>gETO36-^0a(I3DMG*g5b{55O#Dnq@ z=v9g*s&}hun%3eEOTVs}7e0pTQIcL>hhtID?rHy5)*hhChJK$9^-G+1YTpyCox_-E zZ9PbE?+k3r6KGwt3PFF&_n5-XFrc0FIwW@AHDKh)8fSdJ4Q>pFx1IXkgnJ|2j9$p; z!TH1YXRnp_k@7vG+xuCG^q1Rv14F9e?RmDIDRBFpeZyLMA)O%yzI;Bcf8=T->CR}o z>-Dc!kTzK4Y+AN5tZ)8%NakfS${i+uyw+U~mkrMccJi3OSR0u*mTXgSRn)esZSE=T zp1QGgT+A6d{G5($Sx5pc>r}Jrsv|t~pTPg&FF&g@Ig}oToDDols;70+6=Wd~}x$>TfOQVp_>rOR~K{!k~ zK{I8mq=R?2r~iB6;Q%V#4hXvIM1Y6G>~&N|0butI296=I*#Gy66TVx*!$|nS2yYGH zQy@6s1b>#`A`-kVf+Iul4T#}QF+0Wr@K@2vS$s4r*x zI@Dw#xxuJ^z|YS>U}p6)9NXhed(FEHa-`qm2sM6;-fI*jJ(;YAw2GHL#VF=Lnew3! zUGM~@=>Jd@bhrn#*KNLsLP?Zfxl>k*AM$3mtWSpFk8EXv4b+Y>UuNl-vTF#S5gK_gSeJq{zrA&M zx<3{-%1TpL=O$oAosYdUz+MJ$tmH|fNsi;M%M9jQ= z=v-91H>^G_yXBvT3EysQc~P-+&BnS>$9j{!|LJ+Ju2fWBxNC2+W+Dh^d$Yye$LChQ z?j!M_A!T6D7N-5f^i@8nW z84{%KbwLBB{`y4|E0}nK|GYqjFK}Jqw)nSZ1?%19On#36>^@oY;6IrV0LIrdr4o~1 z*CFaf`v5448eyN)Fa{!&Vuv_`iz&Of%>V+#&OADd0}D}-a#*SUa3A@*jo@^P@d33oL) z^B?^ej1)Jf4MekCK_At|(t9g?@J{TGMW~KBJoh!h<;|%`ys0(aB+TIfqyHEiyiL%B z^6u0>HHJxZ?HR6<8cXAlv5h7^pL-6Ts?KgWp*soNpRU^)?C*f(VsCZo3OvxafyL|l z)T?k~;>FVSJaSrF^+QMz`q{82Bepya z{K??Bq&(h>Wyq*@J?0$77lv#&kNVC)qmTZd%id(5@IwUw<|L>lpI5MDv7{tex2xP! z@#HCtrMLc>#{3v7|0$Vqqw2y8U!)Gw%Jsn7cCYO_nByO}?fvXL%-r4^m~b|4&$GK| zN4MWu2Ug{Jo}muN&SRV{zk3MIMlbrhtFc24F=;LVt74R0VO?uo-Ux%VI$zwB)QA2F z)^&&W$0K)tS;H6qx?wFyG*05U0b_-XJ%slsfC3v9N`Avh$nyD#*G*?bJotU@blGSc z=%iVdi4dQMGb?u@=Em~T_ujouJ0vneM_%0XQ|{@I+lF(l%2f-_^jSEL8>-;PQTxL_ z<%gqh-V-HJ9?r<ewQ#MFk7~if0vH?Zz`SCPz!*=muZ(vc*EgSM};l_G^TYLBL6)}@;T5;JIS|YBdf4g zvRuVzAl{#}3J z8>B;{XrJWq+fN|NHK&I)Ila(pYENS2;p=!P1GXJJ>j$WRJsR526AUA%gf6Y+C*mC? 
z%deO_0>Ilv55oZSBAD>=3$>nQBaX~ciF|V<0klrN@f#NDglD|NKIYwbKzci6S{iI! zfb_hJfqRTUH1a8D-WM2#6_?7U)_B!mO})`=L2)~v{H0`GdAB*JpRJY2*;NLa)*8Pe z-&(x%T4pL^J_%Y2=#G4fi$_<(HofJlg23Ohzx8jj%|Mq)(URXgC0w2ECu_BmLhAJ$ z75U^72bo2AGj(`0v1%vnM5%Hg+{p2La{l5B3^=Z{c-s0c40uECeSUTZYTm65vPpjq zKbuVPE`2gZ>ATof2U}}!r0G<#ySR%2ns8;I=kBdj4J`7&_&DXH)F z{7G(AI_injQG-f}AUo);qmV#1mbj9to%CF{=a#HS55zW2aEXh zzx&zaCQrBb23_e6+w*KiuF3X0d#xqb$17qCatoyl?d#LPJMCW|_#sw+SPWuDvi>$g zIS(JkK^9lo*Cz3dN#7L7CYXlw(Fg_p?7;9CqI|1fy;xZ<~KuEr8t_6m~I(r!{~3t1t;jN zz?F;tOi_pZP?**XtO)N6bvAYa6ssXU z-4wUb3&B4IH-zJ1`Q8riG1IGf-JIl2q`!^{uQ=hmB|MCTAB^zU5IzNh^G)z)2`(bR z>moQZ1mA$@-4cB;qNhOQXNg=FkvAajV8niw*c<%+Jo|s|tdxMK>5=Q7VCmc%y(|oc zVz#r|@(%-$J>4*iP)R;Uvm4)~2bn)}!sR7I;RXg+2>#;mt$T3|tZ$P1V+MGh;7BU+O z`FXxN2OHge=%3wdf`w(rdVXKB#QOGM8|Y?&0q@ayv!~YK$kDB`{EkL64h?y>ksq0e z`!X*2GOH!Q*2WA_bTJxqx!P%;G>Ag-Rr9~w4Qk+F@1c%+xj9&5pS{e>V+F8ey^x{% zN;=TlyYK2hw+hsj%$6cnRRBVo%=`S5({Mpx)0rY)AE=W<&U=hG1jfuJM%Q18z<&~K zRVjI{fMMF?)6ut*5uJY3CBw(TKzTWF%Oi!|RhPE>CNS#c`z`K1EcNi*;z8rFhse@33$6VG&T0WdG_ab z#+d*S?E(JVx$$b}n>dhuA@C`^X${Q!Es9T@ci^~v94xn+^1-mBF#BQ7 z=TPF>O40FUN3>af0PQ95IIGhS3UKVuhaJn&mkRPTakQ`m<)&{uTo|la=1SB@8Yd`& zy50o<<+Z(I$s|v&jAs3$rzpZ3Yc{;eYt>Mq+tgLcH5&ciW0Fa+8UP;Hya$WIq`llb z-XDpEG1z{d{LkJC@%U_e#?8XZ`LLv@p=*{u9uHa@=1(1Z1sw~_Y?vb_;KM+pA@P=3 zm@9cX&#ZF_rk7S6ut{lyu~Bcz&r<~fo}{|voe!$;G$Zeh{^}8ET%QoV?2-m#72SSc zN`8uuBw1*BrP#e) zFGD}I7VDX6IcWPlf*%AXI8KTtfR=?D??iHB(HHecn}RY=F{`;J%;oICUHV0bQUV4b z?S%Ug=X)lA@x5SCkwpaHQl$MG`ql}Z;W$*mbEz8c7rkn1`r$6dz02p8xa)BxVC2CGBT%m`sN_X>WkzX*M!Q^DLI*=^P=w zvoQ71+nw#DW|&`ELB38m0$IIy`{*_F#{6#p^TLZLQooq;)~I^|;u;(H@l(znleRM> zHLf*q?x*Hg-y_M$qi8&sEm0Ou9VQVJH4v1yvzfeCmx_)bujRj0cL_#+^jv?*uLUA? 
zn_i9@MFPzM9)9Ix=kSDptl&4&UGDdnx6Y!X7cnb^xnc_gDQ_SkbG4_&0V#+Cw3}sV zL%I(Gstac`VZS~5Lk=!uTqgdS@6dS<@H))$Rn*nnNQ2ATqUC5Pv}7d{y4~{>#)mRk z@v(Ygqo&{cM@rq0U5$3azxR;Z56w}$X35S(v_Kc0 zx_JF|U1$vHJ9kjb)6+i@T5MkJ2+Jvg9ZCIHXR7S{ZsR8E79I?+Vbf9u3i#{Fk!y(s7E}~s?2;`kQ{{icc0((^_lLP z*!D=VSLAK`O=@>@T-@UB7o3bVkf%=vymw zHRN+jNR5}oy?E@@Q9E%N<~yW*3@o^TUY*Yi4$5SM7N3BUj*U7t`p)Ej>yI^R)0Vre z{=o}YQl~U0uzBDja%0_<)j)*A0+MIN%HVIuCy}Lt>LBrsP^f&3894G{md4mw7yb+k zVJVnVfX8MG<1}>mF!M)+L9IA9fOUD_fM#P<6ccZBgdrJLC|Uk4TyVssP5+Jm-pIje z%2bxF-BzeIRP4Kot_cW9*&+7pnJ%!SC>1jJTC{si`Br_!H8Lln9L z<98px*Jp=f&o%Yn6Y-W`OrLq7(LwUvZ=bco&!NsqpXzOKpeV<@)tyk(cV6K&#l1%O z;ALTDTA?wLG7W7uWsm|@_kFI?rRBp^R;Nr^uP1P1V|7lVtqGf&PS?h!yCBNJ!pgiy z!Ju(AUi@loKBnlZaah{l3O}R(nwiRcT*$nV>-pCXmCL*DnYZDNgV|Rg^k;E^kV&dEH<%<$*&$DkGEn*>8F5q58?E0D!mBk6MUp-F zs9!2&`0nXkB=%`yKPlt~<=(t`_G;Y+ElwStI`vo=kwtF2*8W`!>)zjpWa2?;6_1av}ZFGS=Rp-SdSly7^*~H2EK`*3T z8QGsUrHh1ib#kx-E8^o*6ZpitTv%VDvg5zpWaI{o>tt1&k#k+BuHN-q2t4<&9HlkJ z&-%rw?Bm;DYT(6$jXmY4++_!?Rzv_|5s))^y6gqK)6cdvPO)OS`5A-hb_N)dy*{PU z698`?KEO&79u5w8X6=_`utyY&pXS5oeNdC$scf^=Tgds@R}Tv%UQBq!3EwT@VI=%u zgtvz9DG;1*f@j6$90TizKz?5kT9(-S{)({YcnCy!$}u zArg!#Rwo0Y63YGKE_c9s(G&G%`vK@2lPJHlF9$5>jk3Q^wTEQN`WL1=V&T|nPv_3H z3{;@fuUoQlWo^9 z&9CLfPVuWa7oHLM{wo$Q@F>z`uoMH<_m|#$>B~m44!Kl`74amG##Pr3U4U=x&VM8y7%$g>$@e)(W&H6L zcOS>m$#3zXp~tVRElw%St~i%O;^$!-iZBKm4wA9))Kdee;X6Qur$SlWI1F5u^b;%$ zwgWm{KN>pjrK60fG1KU-^JtfdoK~bk6tL%)()Y+n0=>?rdpRY7(C~!a@2W4!Brevu z4A^Cc(~r^3ytciCc;5!=NVwktJ7gy>XR_GiHlZks4Lc{W&_Ap`^}Gn!oZO>(ge?K? zRlX&6LplS@uoUWqE||eed$D(YV}kJd2bYx1${>_cVivK~YKcT>&(ia)x+CMr&3)C* zH=ykOD`8V840?O?y-bYoM9&mU<~tmmz*5fqt6$HqAcJtFVv7wM-1T?Hi9X>NR(C4A zA~RwK^`z8~G?O^=)9(&bl>UhWcXmBo4sTOKzx%QkyV*_9n7$Nu2CEs^A4a}!|DX$q zp4fYVi#Z0~-l{grpdbFzzA85_mIHX}o=me-K7onZ^J6ONMfjX? zo5yY;Qom=J!;9wgUBEox%m3}L88T46w#_Sx+g2 z_Aun$tRkVWT!f6LTDN)%Q1UsW11UcXPtyBt<+`xL=A*5=0j~Q)y>$n}Hzv3DvzYl! 
z@Ydb{V^xdoc{ae4Tthda0y*d;h1u*vSW`m5u}9bwzt#3sZ@E;B)OCV+0&KeQJ66G# z%qU}=IF$Azro9?<-+uYU)$#?VRFR6Z{tclpldrSvqcZfkXD;qqT{TYOXuTn`qKS^9 z{r6mkV^Cy&j5Y2_#bdFPvx;`tu}pnWz*vF={L@ll;Tmj=%2i5IxI{wHjd2~79HTnS z_~WF*gsd(8Dj856UaJKe?9_#xkKY8}ucpMa+#J0gwAKQw>-swI*C>k?MA z&eMCq#}9uNm;U)7YJxVdTQbC|+aSU#PWWyK4QtBG*Oa4Tw7!v7aUO2E;r|ytDJM;@{s3E?Yd(9_x_Ys{CS`9`Cm zjtehf|I?lefgD9>NtAKL!QTz=S20`w5q>ywv8gDK+!r#mvJc$ra)H#jp)s2jVaVVG zugt0HP_WS#UHs%~5pJ~~`dIbY2$!1=%*{K7KsvV@PN!MSKrO|PMN@bNNdIKoS<_mL zzQ|lB@E8apg<`vI4c}aO;`iH6e z=rrI=+UR+SVs*^1`OT}F%?#HBeYKFia{)y8?rEARcLEZ3hohTTqETc*{kDHPJ8ojz zi^{)4Y}?m27fiG5k?P|6a+kj-1m(W{^+#JI4@J|}tMARfi`lF>dwX@xB?>t$rr-bdrLHw3bQ$lxEhPJ6Asd$iZin6aQR=E;ucCQ}pE@lK+CP>ss1@5WIoR>=M{H@yQ;yPPxfgXqr!5Kmz4<|?{2_PL=+BBzPkyISkCm>Iy#}}D~=yO|0MNRBITrZ z(1pRDK^M`1Xa_jb)VzCgF&*6Sa1d>Dk3nlM==nmuiZFUMfjnF8Lz`0}oV1%ZSmygj zqoYSc5aoqWr+Z1cY&l8Ya{(+INbYXH!{!*0-*OlIp;t8-a7RU>`ER)xtg+0z$6VYA z`?s`~Sxi(z#ziid2=EwwQ$3EIPt=jTcbTW=EDB*{`=!Pb_8Q3d-AZ(HEC6wnE2{L8 zJZlrN*PkD+Y=Ju-Fb(wb=b*1yYCoqJ>)?0i-69rUkKo+p_^{vmiqZS3TSkHg&!DEy zVCnSbSY$D-G;x5GH=3V&r8aF?gFK#%iwPUT)uh$azy zEIYN<>G2E?|Gsy)gET*z>h)(H22bEM-N#;wqG_m~jOpDQ?pTnm6PUAizzS&18;JL4@Yz*ABm4B9={`bO$0knVuK!le8xcA!&H zCgr-Y@nl61DQ|#X*|lGi?qHZgA?g5WKZ~1>CvWc!JQcc^_Wp1{SK}``ZN0On>k<^A zO^YGxt;d{-KfO_RE9w#Hb;ZNV!;=fX2wLpobXoWjjE0h_W2(npvCc357g6nPFmJ$| zN+u);waGY4ORXp10_Bdl_j1;NZR+9)8mmNL89YSVl!;+E4y%RU!cbVcU-{pEStqba zv)}Oz2@k+%y|E|1UmwjG2d^bu)q>{^yW8nkl|Yr1gQDMVUBT-a!TW;+!$F%*_8mW& z02Kd2Nm}Er3pBI}ZcEeef=#=2upOKA#$6%*I+yMQAa|XvR-hCQ)GRLy*jojHu^79O zKw~aA&?B6@bTkrbN!A4yyD(zbpt(O*zPs?gCs*D(bjTpn1jfo`tq{Pn(q`ci?f?j{ zIN`e`JdA`NjPTYFJ_UmFP4H(4E+WC}A~-Sx-+<`d5`8eDr$FRqiCh@y@z^y#MH(;4<_z?udU7rwL9TQyKQ+9uU5Ki0$Ma z2h>3K^oB!M2KWK-< zc>_Brsy1%-iOwDC++^4o>DB`Nv4eD{gX7^Q^Hid5i5p(;nr+q>wLlN>GEm~Eg#(@y zZ+-?Vp(`RZ)^xv}!4W$T`tt_1urSBJ?PG^AE=if$ZP%a-*HmsC-)DCQFm1GCy%W3+ zmPFZ}cKPMQu<|@pP4RI2{JFd1nG-zlgL3LG@h=Lf{`8st_2$E1$Ex6_XOS6*zrB$7 z?t~YliQ8K<>+g!S-DuZ7Nh!l;5otTQb1z~&b?wjp)ed89TVVg<$Q1SvuP`c^86fen 
z3JmUNv|u#`?vST$-I(9XOJAA#25Ap3d@n*CLB@$JJ}2wfxUqUAzZ9GWBUu$KcZ)iq z%)v9h+T^$K_W|J=`85|LVP4l4>e2$&Q+`1~BPZZ-WUScv??q(55*4TFk_X=}iu=9s zYJqky_H(zUB5e7DU3{0PBY2#5`SnM zCA&!*iz*sjTPRVxsfHd@%(m*O8lZ|xJ(}x#wGcR13&YpnS*d`>I<4zf*(n?IQkxoO3 z5_3o9&1I!f~i%OJ5LMjkh)x~Uk6(tvf1s# z>2y2<#kM@lJDJ6Z`ICBW_d!WK;mQM%(KKh`{zTG>`?7z z(!c+LdAeC5N>I7jb9ycmT$eev>ic5>`hI+svXY+#q++`FviX&OK=I3#!S6<3SCNaW zGvghQ+juO~CQlu@W=1(JvqiyQ^_vW0E2KFI{TxEVnaK9m{C>HWYM^5sBHnN@3>&Q} zOxsV#LOn&Uk*m-WP3#jYf^6cqe0^9`=Fyf%3O?p0-}0M4Et~2s?mkY-RTX)^ z7K%z+R9vG?ih-9R6AhJy5BMxXY47Fg3AnQ3IJ;=`k-Ej1wZ%bW)Eg(6Im{UgPF$e5 z^!r2>;MG0ObX`jX)%G1=__8A#2u?+KI`v$_&KkeonD>UF-UOTGsG_@|{j5S#Lb@58 zxWZh#B2VfYSzU?M%Pj|+PF(gyefiK-)u7mbE+0tDgty%Mj}PWvNLWv2F@|roEgVdS z!qKwGUjK9>O>}#Xh5Uto09sjXGkQYlk3+`o6%wUWphLVS=cKMH$`qzTG&{oq$H0%H zLQ3vvPWZ$A>R13B$D5sxPCDV<$_9U>x32JE{SPMhJYPV@E^JHxC=s}FMtF&l?n-ua z2PgN{>7z;ei(>|s0pLHE#;F8~2oQ47SU*BI6s%Iek(>?;2O(dx+uQcKfUgS?dtNL( zhK{XtnJQ0;NqdZe2kckl(H}1w^?d(6*neqZ{#|4l2zxQ#Sa8S?sOLUA*hljQHdlqR zU#Lz6t)qKb_m4h>i&r=+pYxYtS}L}}xJ*wZbVtDK+h_%NbY7=xH?&4{Ij;tl#KvIc z#}_Zkm0R)jw7^i6Y(8*h{E*nzTLY5s4b3sVFC*m~_7(n;ehD{Zb=a%lwqSkiG>}Y} zkCSss=H8#UgNk;Y{P^bjaj+f(zq+tzpu^{0dL8{&ib|bQT}XTb%+qyFnAE$)<2>nG zeK2fV>ATfaz*fH;xAL>tlv!ab*M%u;wYTyH7--e~+`5BN{#cs1C0>q3ivzd!294Rm zFSq8|K6A+-rQ9mS+~aT(&DA>+_TD*_g(&G8jw1)7I92ev0M@6hb z*8E?1NHLP$;mlS49^tB%`+g;#%+ciMhLG`(@hCg{zJ*SC5zhaycu$(a6t8QXFp3}1 zfR47i#9xb;AuF9va{t5AdH8evz5icQd8?3+D5;c4vggUMLX^Ej_TGEX?2(?0|L6Qbh`Mvb}Jiq^f8`tx=uE*nkH)d_XoIsS7xLCA0uqYa!x6UJ4SKs&83g1; z6d%SfH~_j6UQ`j@erVV&Wh`M&KHf)DU!e>n;3p?F#kovA{Gy(+ej#zQXYSGZzCUN| z(cPLKR_R^dh;^>+9__R~pjfA&IkM7=r>vU#{$8|$gjbyK-4Y%~!VgAxYY3kL!TBcm zvji8B;B^rk8G>&>^lphh7|~N8^0P#)i^v-gcQ9faxfl{&nKGwJJl zEQ?4UtigE^hL{W_Xj~?KFX|oiL%C9~ntXAr#6o^qa5559?s%SG`3$ZP|2eUnvK*P- z8I8IQf&h0N`=YH?3Ra8vT=08j36;9+5;th|uN|+E)Z7o0Y7W0`6doLs?WT?{8q9D$eh| zC23GYP~%>4Qy4I(dmQzh!W)R4s40|cZ~;DH?}c^^+2i46t=KU^X{Z*qr;bAK8oWGz z0!+&}_@INIJK77UtHN`)D^@cVP!#iR-7*A$E+N3}K 
zS+^U)GA)XwDatG$!y+9iVw`}C<RFXB*5t21I?bB0gy|ICU^AeEtqnbV`a)Z56jA?=Tk{o;>aUENZ0u; z=xfB*b!2xRv>8*66g2LD3qt4qzE>)O#VO2M>k=LC!?NWM!z34=XObesYg!EHH5&(G zXZzu9|FO*bn$jrQGT1|GxEscV3W<3$cSD)Wc3(C4+|U*E7zF*C4hZT^$UoCOL&khLBT(h*5G9=xX9$Y)Um4SykSxYul z`fwVfV^^L^ru@z5z@dn5-rBZt=Sujn||;80OV3+U_Zk z@;ir3N%>i<)7IBV%5~w~uebl@4fL2f47)z&f_avkcenSm@jWm9?G0`$u&S3d6oY#M z$$Z;$*5=NM=ehU$;l~#%!)<2^Kw=P2V^6F<_Bho}QCM{!n%=$$7d5Ywye=_U)H!nS z(hhfy;sFov>6_}474cN?#4P5FWs3@aT`lEl=br>iN5UhO-Wx*|9WTaTcca1Zm4IGb z8+|}E@rTmkkp|uzDiQqiw-W9UpN|yt_JB>Z0~tC>X`oMkR#4f-2vxa7kAKOv#+eM> zlvDo=K!<{=P92GBkjLLf`@~=fx>2$frVtqoI?le16449>lJBL5+YFSTzN6M(7Mdui zbN>B(v&M^fBRGkAKb;yRyFZn!PK<4MVYOGjoF$1( zt5-g7RJqv!m>i*69=REUoP3^rWA!KX-bO#Zq8TfNX?yOix7Ie`iR;tQMf-|C3om-EEKXF=DGaa1f^*EeqtAhL`!wGot|lC zAYq}2)@(2r?yd;8EdCaPbvWB23r`kfyRHzA&?99~y%uxFCuRY&MUi(q%QaCanxb{P znho4kcYZOM$pVy0fBQ&&6HHEJzs=ppxv?SJJZQ)c7`Hj4aFgpFKM%i%qu^jamItlm z?1uFpRlp~PON|%H>tO$c$2iCBQmC0^6h_w+i^U`Io0%O*9&D4yjxyRC@Sx9yR56cq zTsO~gMsH^-5T?>-m?)@g%A16%fYDauEv79 zTnNyZVIRAw<_X{1LcK5#tVQ{p>E8alGqj?vU04lJe4ypRH=kT-ka zlek+ZW?Cz$u8q8ggPguyu#5@?P0a}vhYkHv+OF*NM=54NYwhHd)K)8KvZWw+Dl!V| zbf`<05Z#k}2P=Ip4VWG8 z!_O~Hh zGUtQr`uhK_k@5${Z_9Wq!|V~0XN0Usr3TV64yR$6_|LxEr z@iEpcetRrB19qjZw>P{6cy)kbHhr6X`9jKueOUOukArQf08!6Mw;2h0pe+-#%EvSIDE^q%L4iSPh$+^1 zdO}a*Htxp>#|7iy5`R@=HA^C3j06ULntEt8=4p9LMG%s!g_Pskp=iKZrA<&m5!M7| z+*t`x!IUmRj9c}$;eMZw`7QtBG*Oa4Tw7!v7aUO2E==o zn6vh-+Nrx|W?_o=tq>pEIIvItQ{aMBIT$!`DAe%c1gu$O+SOB>1-u5jbcS9V;!%Hi zakSkZ)&yO7IZTm;uBG(Gr{UQ0j%_b0+%!?%_VwW&mN>5BQz=NF>vhUUfkL3D zrqf(;G#-@ly-yX}^CCgV44-tFxjl!um zdVe{%tn!}Ec)A{1cfU_cqbvj?Ric#hG}lQzjXyzq8Ux^u)3=U}lETR?d3b<+ z&+lKWB|_24QfHR)ml6!ZnpKpiir|U&J41#JS|cZ~?|$!$v%px~%?RMz=c4!B>cfJL~!Dh8=PM8`z*j24yr?{-tY|F0A*D+!9$bLD5EUrxz%58{@nLQ?{Em9-3&jpud^MpRr-8%Wi150 z9E^fNcTAA!o1QSA_F?El<6~kbUI>PtHVi!oEd+OA(g*S4r|?kY%UC-G()UuJFJ!~I z3x@7Jr*$-~1n0}-AEoRHLeG|8{kcnb3v^ulbKgZOxgv-LDw2UyzND#t<4;D~ zMym-YW1Z2dJvi^{?;I4LH`nFeUWxp}-)?geaWS{cHm?g)pS-)xk--dw5!-wNJoNa( 
zcJCIm!z0^$FueE8zn%gfVM+Jc&d+WhU-ArGH3r`eNB-pvYN-Pgx9?zr-)26#l!T(G z%KyJN;Bs>M_nxKX&`~I4&Ov%NZ$F{$8pn#-a_UtzKA8E|;#UFREL0XDVKtYBFznRr zWq0#Mr=A*SMYml?LflDC%z8KRO>xQcLnUhXdmZ0{4>1lf=b+aCe@P+0BJ{LtU^Ej@ zSP2;WWVPYcJD!=|3IE~6(W%_(K`z)&MSdea(G+Cd*}oj%6a+q=wJw?`zl#T*{!}s@ zHiO@z+gyT$%<$SKZDBc%1{WtbUvk7G0!il5hw&^)fZu&jCH73m?91ZkW-KkBEKP}- zsg^n1ps{WKZDEa7rJfpv*83o?#zqG6Nk=rCqH$KMDg-R4HT>wx?7(lWMhwp12!#E1 zC;YrS9AOA$gy-=EZ!9v;+6>530pS%Ve7A&$k??~N-WtNEKybbZ{w%>oBzRo}M~2`V z5WQQX4@UG9i2N*(>mu?7#2t*-&k}nB;yp{uS@-*&KeVmAgKfKFrIx2+@Tk((e433p zFe4ur9vy!Lwa8s8cK8+{EfF560LE0XPtvzc>-bGf?Q}|fZs%3pKWRf&G3yP_R0?@e zxQC!8c!6%GPZk&?^S`QV)s9!K&Q&YvB;e6FuPN2HN$|Cd7JKJ;ZRBm;aZvD zaWn4~lM#mMzl?XdMcTopDVf*eB98b;bBXfQpgjmHqOV$Xb_0jiPa8gb5(84|Qbm?M z+ada*#b=m(4a;v-`{;d$hbCmy4d#@QI5znL9lV;3s0$tsip%66cHay!D~?;FeI4WC zi&;AqFB-M&>%%q2{#+w@q#*Bv(YD_N&RQMY=IN#1Q9G~D?4gf3=(1%3f(INuav zJoz<`;tl5zJ~5T%d|G<|`+dFU@h!RtO3tYm=4g2#`lB}LpLk30_skQ|+w~D7H;4=; zn-4=$H>jc`n7c9SEvH1~iV>J0&DmWcm4;%`*!``%VSHqP$DG5~4)v1V2DZn|P~5b$ zE2TjQz8k#!IIO!I#tV|2To+8o&p&j^D6;wiLvDla7s~Cgp8$%lOHf>oop7duG%RjvbPM(rjWp`z0KmJ|0446K{=kH2Y#Bl+FF z^y6Wm=J)c(bDk1-z_aefJpfWYE}9#X9Zk{kYZ(fP!4%&-ZPQ-vn1cn-LE7rB8=I{ z-{y7UqzBJ;kvKB=Q+@I_-vF;K3T^jpVVS(eb{`CaLjHd}g$)+Evn_vWL9uT`z;>={ zPRjdV-aw5tcvk#M3y6_TJG{N0t?8=&w>J<}l71}M&rUOx z9MuXi^NaTS$)z;RIpcD~H829?v@|z}ZzKYbKhGSy_nYFh<@Ua7%)xLYtJwS13s-oi zB>r6zUm8eS4&zO|;tvMcQjOJ`Jh69e?~(JdS+GJNEX_P37S`^HUB7iQ4>aj7m?Rd( zfPs7W4pIIL!zMk))eL^OLlfiWj+K*1P_&XkXg0tb&EFd$zxUh=yo>8<9JX*Iaq-z~ z?eAWNHMz6aPY#g!>mSp;p7u1xd{hq^K3Z8opT)3`pZD8XgB#R=UdLE+JTBv!HaZa~P~U`fVbKluxYL^Z*#|W84@O(P_Ctdyhi_j9NrhB0W#6a|r9rY!Pm;aO zOL5~p_4^*zlfchfhbV=jn~0GS=!yS?^XQTpSNdD%i7$H<4-1p!!YBa`hxgp^ zpkMW=(AZ@|RCvkJF4Z#y%>J@rXv~TShQ4;}Brhr^d%wJ&$?1o%9?vq^iQ9Fn0Sy0`&b$Zo4)-a)#XCA_jc zyy@Qn6&+zPKjjTl{w}R!+9U!j%NUGI+nYld<$Bwhy{R}_WIpPoaU|ri(D?T8Dv6W9 zwPe7qo&|Un)>cl=6~Nr1#hoZWfUI@_OC8(sem=#ym7nUZLF;-?igL^2wvCL z6hAR(g*4e&wVR@0@Sfepp9ilb0b!kJ|8Gl8u-)yq;7c_x!0o!gA2RI(^w6 zj&^OWaL<;(L1U4z){#OSJt0a%=X?WvAF0%g@^(TzM}MSiel5gQg+JUClxyI+_n(Qf 
zwg?!a`eJ_}oi@VK?@C{jrbqUN}H@nXVCda@LH)KzTT0 zli54@Jpz{MUMopdbwOUOq0(uMq`ijfd@>#1Cy6(YHm##cJdCoMo#a0HMKG38DkPsX z4)q8AdRES@02)PmA`DA>(faL+uey)eLzPU2UthO6;pS$x)8yq8G_zzSa)#uM{e6*v zZR#Yc5B*&8SOvWV^osMR(DMOUX3;ypJH!L7YEx?Yd zSjU%05eFj2Lfv!YKAN!StK97_%1gL`ru~pB>1RcF#R=an;bA2FV1&1Z@F@_SZ-PHd za1jY!7r~Js_y$Dp_W$}||LZ9b`B@^@MdS^LI~cK_CH4lydzP59BH7gkeK;S&w#mEM znpgEe*ZwDb7iWzzqX8B8`^OCKx&JcEkUA9=UueoXbvYKfeDRZ?9u5N1kw3atJ_cjj z^cgnpDN{_f(W|^UlZ{x!0yI=ca*&2da?P=C9_Tig?4bO;Xnf4~SHp+E0=&^bW8KSB zjw%9dj_G|%LF(r{GRZ#`q3QNNCI*7~kR|;VB6lZo?%xI9VhL5lwn|-WeFK?5D>e8) z!>vfv8LFtv*&G0VejYwwB6R~*jLIzwx%t6}FUHK!#0_utN$&qCng(rb?$2b|l!2L$ zUou4l=_svz=;a=@L?C=BvN0wy4$Ud^J+U-!z*Rm07s`%Wz+!(t3HLb#=q8xFQu-zY zx0t41u=y_j_)G9zi;xC-r8k^_9*`}7=f{9ys)|J=M!_p#$@iL z$`K2gCdr)6Diwnk6&s2F9E*WwvA|auige)cETZw=dfo{D+Fzo zXO0F$mBUwWj1F}&L?RX4PKKa6X&^W0=g_mo7Wgj5;~SYn8sLaIvG*2B8ZMl+lk!Wy z4(CL_wUbvyA;12>!ZNE!PRy73glFA`fJWApiHSgt#sNO z9JJWzx^<)m?qBgf%c7qI%5;X`hMx8VtZTk*dP2SMvQ((&$LJ_v@_<_P8>!zbXKQhc zuKqrJd-}_hbpdarF*&3yAW;juIZK2L+0(Jl!PY^+10#&)^iZyma*Up#8tRfp{0gzdP8pOH8fEY&lv;;{UfdSlQ7PzWtsJ;e0O_ zv|5CiJ?PUTo0D)D5Oljg9*>5ScJ8CPZ-gw?I3tUbVj!$4%gT;dh64|r^W}fEdcl`u-E_CSJxo0VO_-^NSUweOGd$euQ%`goF&HQPp2`>UF@;@7Y zol3wrY?qa7YFwdZ-m9hgYEPKCBU7Bh(I4}SU(4PV;Dr)rqur^q9MQUg?pmsSBrtj9 zqwt?iHGX9~J5c8r2?fc9IPB_UAm3)y%c-#>j1moQZ1mA$@-4cB;qNhOQXNg=F zkvAajV8niw*c%YZ{@xX$D*{^`g=&61J82u z(`7cK?>^N%_L`@~fWG_60sL(kOQ!y8AKHk)X_`mk1O?;a^vPufi9%KISpMZqN`45s zokjh5@KYE_3VwF`OPCkrt=u|zElB`Ma9uGmVl{(QUbQAql})i}>(4N+ky9v;XKW-> z(+a31u^wY?4+oj7KaKaqv_RghTR#*Z%VY6&3orTBFz9h9Zzk$Z0QTG3Y|Ly=MN(Ni zXB+LpQ5o;V;lqt_C~B*vFYHSo+Nc`XnJAf#MKv_G{nPO2Be3m7g-7oRZ2S7~ktvOB zj}&yf^d_!!y%#ST9rt&k9K#bvF4`03<5)~sxpt4=7>-u{qcyJB3LWlUy&ljPg2Mi+ zp=7FZY!&^z;8gk${F(98f6*`<{izf`aC^85>#gbD^=co7jy%mjY-S5dy|KW8gOBgw z!Jt>h2@|FuJU4oOkE#Gl;fi@6!JZ7=`1H=b)U1M2ha91{W+j#jb(|?+NCM=>gXY2Z zKBR`3+>P^@mC#RtZH$lW9z5cjH}s*mAHQq5 z6izRNKKm}~P_=sE4%1uZpD)&7ohwf{4F}7iGDG7Pr|?F2ooR?)hqVp1f!zW>9_7Pn 
zarKjDylSzq37fotA~i-B3SUU@~G6!Bp_*f-d5E09-JjVVogQ5$KK*6|NZ`}1^7_C`62O7KK5|U z;&^dB6{b_p9uW)9hIY~t58kR~fc=80G&^!VK%(tlQ6V;mO+a=z{T&*76AXnGp-@VA!>s_Rx3*;nm@zt@0AS5#|o?^avxO{1IeY}*=n z_ooavSNkJl?=KTnF!V0t=d=mBly{&k@%1I}q*t1}IZ}bh z)g-TlJ1Jmc%wBdm$`y_O;oa`tLTaO&(Wm@j;8B!s^3xGF^gLE{J3k96|E!?TZ80ES z-%WL2vpJeel-<6AVFtzBJaRiyK(FYRMQI8xBx`!&Dd|0n$I30{T@+&g#m6$64H+gJ zwsC&8%j_7g%KOE2`Hl(5Y|{C_eMl0$V`wW|ViH8hlQkpKG8CZbg+nf9`o;0e!-jWx zD}I1?^mFU@#H@r(RY*-9zciZ3h|K)^DhQTZq+j_ssETiWH#POLjRh3+)TiS3%|U$f z+_zO1F~p>GY0Qo4M-QX<#p-keNjOEfD%_UHMol1GuaH=9Xa{ zA6R)geON%66;ajc9eDTXD5f&>^lphh7|~N8^0P#)i^v-gcQ9f< zOY9AZ_bf4IEldP|^wBH9)lSgQ3r9toC$0;;K#+nW;Equffc~@Y zt+`!p;QMNL>o-R&AjEv+sK!YzG|BzmF)>CBvrdgnKkf{NTz4(MsIIW#iKgQP^(j$6 z+H7X-z*$`&zq(_Z<+cXO59#OL6B-ZL5+j`ey9btVyt^fE#TS+t_M7t*$U~)v4-b@; zH~_8sZq@W+bMR#$eTSX4A*zt7+3lX<3Mx$8mZ;9Dz}``2sXI;vSa&jb;H{xMw%`wb zc(wZ?s1tn29y7M||9bL!DWMCq2il4c$PT%vF1aj|%zYh+BNL4kZo zalD=R^^_%`y_1*J^Dqp&kYZx-Rf_-yE(&b4w4&%ps@@&rnEhCfy!pkxA_$h}zqb3o zv_K9Vw>F*0PD0L5^?L;zj^J;x`wDAI7~+!J%Bg&20(U2bc-<2Y2h3mkESV0@_ZZzI z_q|W|Lvnto-}4ha8yHCpSH#JRsJE@ucU4aUn4Ev7d{9puzbrb(Y2vO0r+h!nrB9fE zGDXc6{&IJ;J8u5WE=5PM;T195aFY(wKYqXau`MHZKCjEnKfnw3tu0os`yB_2j_0|v zbJbADRUUeE+AE-@Ze++=RTy2Pyhv^Sz#7G5dhqtzgrQhTQC=D;U+{v(j7Lky5BN?! 
zNd8jhjEXr%2W_(Ra7Lc9bm#$Jl;9Qic3n3Vb%yUXTM4Pg#Y~eo?oC}sRUeiP?Yeda zaar#yNV!);>M@=>H^rHP76epJEOVCQwvTt~6jIZmuDqQYN0B=aTc6q;R2GVME`BUj z-eZSchG*=^t!&Nxs6yy|DM9LwMgWkE(|h;4C>?FxaGeeQdJRMxRP*{5)MFEO zN)IjfN=Uc4v!fB3!N?IC>g%RHAo0?zHko^&z$A9k@Q6Aks?&Y(1&IT~Y7P&h5Pcu32e$>`m#626tD!-JY{?4MmgBM=h~v1!KGzD=(lr zUa90YcL^{Zk5bT{mjV^j9Qul_XqV?k5syD~T=NNrs|t`9`nRdaWWS%MJ9|LBgJoC2D)AT%I( zr02xPjlpA7vpuCJZ#~PnPJssm>~>x|AA+UyqO(4d_8nb4@)MQvQh@M^6TVx*!$|nS z2yYGHQy@6s1b>#`A`-kVf+Iul4T#%Ljsr2{<1|ZGJM|zGa$sj3~)?sg^Db$e4AiKvj0@Yk%)9gFq zK|W9Z9@8-r@A`(+qU=P>b|3pN>x>k1JlEd7_k1U$Q<5a{UEj?~P4c`7xE$JbE)<cef5WH&iq6`#%{>@wbzgZ9l}kES{0WN&E1bD^v{^Yu#`ueRzVdnIAYWt~{MO zV~uoVuV=3qIDr?8RV@K$y^%{+)cN`94_%pdTGn&}7*>(Xq@Ek)l&ipaI67z- zo|U#U6#edDlc3F~LdV}GHZ_@u;e$~M(~@p-h&J}tna^b!;J$O;;OK9CT>STq=Y>sc z#Q2QH{v(7ZuiX%9jO%moSc1ulus-$PoU}pqBYhBz*RKkb->UYR|)q>ro6tT8mint^E81o;%3o;UZ8*zq8qv7-UzX!HXpxrgEa_*S$ zf#4u^cZtkPsJdx`^}q=&G;&sAzuM~F7fTPUIvEQl zuHK&cXq5^}bh@NzVjZCGU?g+xe=_Jy$@8S%Do-HTDrD*L^%7vWKXPa5OcAWJ&3^W? zDGjm=k88gBtb!VDQusdkod9~La{|2O?9rnNn`hr#N?>VLJ#_jK3wQ5V@Yb+M$B&=r zEHiTWBi}Sj+JQzp@UtW}oQ%OAxWzvIuS8A;kUR7I^Dw@NP5$Qx6Lc2%=dDqD+4|3? 
zkU-PxQRuhtx!}7y&&_t zDe%D+UB6(cBem})Y%0~e$u6J7^7>(@j#Dz5C^dyp25b{6fvnREt>b1JN*@%TZ;R?Z{r z9vpCw>zQ38ER4{n-|l5W>1NMLPG4kH05AId;%MM`y(@r{hFQhE&Jl?jtZ}-S=8$rM z(j1}Wm!MC*uW%uoFl=p4QMyl06y`z@Dz?bU}zkhOY2vU{GC(>O=3 zo<17}>sJmOG}63|?n)Wt8BUo19-1US2fAUXllfJ?;n$cL6e;1^$>$F~Ew}T}bi?=Q}XyVsZ`ru zi*`FxJae{DhW);i0R0Y?9&odQtgvrBd4qB&Y;mX@b# z)rVZdXC5xEUBj*){L9`l2OvYo{9V&Oe4)&-4;-6rvtj=*@l`4YLQ- z)+>!{K$*Vc1sy$2G&$0gGIU4}sxS)Fnu(j>=D#z+5%c`GcA=6fxj6(V8Z}T&rMsi} zvkC=vdKf(|@`M zw#Xm4p1~D{@2>agyPpd|(x0a(V!4b#X;fW6xJNux*=*G8OD=`aHMAN;KWjsokJaS~ zo|Y)uZK!!f$PCH#@r(9naUx%fnBy*=i*VA>>JElSLAc)O{-zJfw`?NG8*j2B8ZP>> z4{!T!4Rfmgc^F;y2Y%o7gI!Q~_s?6i@lxrZPhmIv*|*!A?`Z=lvCW_5(3IWgB4VEz z^-BY_X(;#8;5J7FhIFfM^9?XC>DLYN&qk|B+}nLHn4877-BZB4Hmt5mq<)(NeR&Bf z*99A_Q@8U5Sh2W@@n=dhVh`ghChcc&X3i7W@FSN|_S;PIU5ieLcZMQ&d(O&qAak9s z<~Y%XIzLJL4Bqn4Pki2Nf!Z{V9(rQvf$SYz2R(u=LNpTK>)IaQdWgoy+za9*hiiv}b%^$lCk>;}% zU#`h1O=Hr2yxT#9hU9CDzw{(-+!jwcc?cMvCd2&(_UdVgR5+i61N@oChI~S~zc!^_ zL*mW?y9R!^p}S5s6sMga{(Q_?Me$oEPHbZ44E%8drb$U>2S>2u$(6ICUQrN%!THZA zhWmj`Rrl)~+yu24zw#+5yd zLvUd7l25+SRqWts`nDx39JSgHc=>aN0J+Kj7V14Na6+ZC&M@XSta5zcZhhPV4=}U} zs0;feN?sSX*+oUf_$jq$mel*R<2dft8%FZw+5SnmU*QS$&UsN$e)Lp;jKQwdo*FsY-?MPSk!^X^ls zf+XLh`J&arn1UX>$yTyq>!OWEuF}vyW7EaIv&l{`Ks#I(=Qhk7=Z43&q8F%njWAEA ze!`BO(lC1r@mc2CBZ~g_Jc%KSct^_Y*^D|(=#yvi;c;Lnq6l(xsJz zR%_i$MfGtgOwu;)(fdIBt6eFebe|)*Dz@j?pG*^kx~(4i`AcD4+8$Z8Oj)>)IDba` z5qPWl&%=(&mOPR2)%(lUgd6hT=m2h$9g(t!$@V9rdek?k~1W$Gplz#~q{$Ky;Wb zUv`Wn>JE=P$VFcWX-59WSN`_JZlJ`$l&1*m={z1fuJ49om|D2VTC~A*oor3&lLF{; zafuUEXW<@Y^~@%C9doC>mZh}PLNi|?rgLUKcOm|l9I6HY2yX3;Fv+#SRtF!CB+FGn zt<&pSbmEctcS`*+9@Y%}>B7yQ+g|a%-pB2>eYdjV?|CmeHVCN#P7q5tNFEjqQ}uG$LPp6R2=Ag*^~a? 
z6;im`uIOydg)6u!j*LuwSQ%db^w)JvLk(@d>@sqs@dt_Vw0x%gE?D@y;oZ8QZg`ZH zYnIxC#AQ0@SCBR#0hm9#?EN}PgT_1F@8h!&Kp`XSi~Ur)vBupPvdMSn@b7(TLY>Nz zkd8hn+uiVK52($!^qfW%BwVICy!#3R^2yk->PXtZQN4`8ijMl&ebdsM>$fRxQOkdN zW+j-ECzBY9sb|2^|1Hj%{wIjaItTalW%B?H!}aeXV%$Kv9EUG82?2ZSrg!GF2XPv; z`>D5$V))6!%LD7}Ca`7v!FN93Q^a5n3i>0d=0`Q%tJXOB zw|~CVvNcScJ5|45KN808EYjo{w7@f2)YV|t6KS}sQ{)KSklxR|<{yllkR1PXLB?w* zJmTzb6?)JOx>P=@@sJOLJiBsMaBB!Q=vtaSr{)KQ>#MB2eqTrHB8`s~!&QLu3;j#7 zU;U7z+nyK+3JLgACE4jqzZINdijWaK;f?EZ{?niMVTDDv@+ShUP2v6XZfYM6h(Jc3 zrg7(3Sy=fhqw~n}S`XPFXO}0s+IaM}!&kc#?AYW8$HI+ZMP#QhOkOm4uji@fd0`27 z8tcjGrObqyB8N^-wuZ;-khh}f>$P4TxE7#|!d$ITseSRW3K@E6($FEPxK|Fe`9+VM z{49Vx3eJzI<6Sr&+^1r>yc9xlI1>UByp(bZ(g}aWz&!*?tBMx9Lh)cvj(TeOM}V@cen?`6CZF^!sq_ zq;whV$O~EFU%3hbOs}Wu9?Qli>-+MzuI56WmkG2yRiwQ(O9JQInLI2qSzeXbmxON) z8FBCCiU%rDxfzwu{BX+~spr`#Zs49Re@T;r2eNu?H+qiW2-Dic`Ta31#urtkc3;ga zf~=>K;zu{5fvSoiC&yk=-(&b|3;pRlXd1xjk|`5{PlQz{*{hkt)9>jt*hUfo#iL5~ zQ$axX8=OG(?LWU^V<+g~-Ok)7rs;Ls>TgSr@SN&{#4pNONe z+ZUCrZzVyisyFi2^6e7A8s;dZd|2Z!9l%0&9)a zojG$L87hq&*z--$06T>Z@D-JNflt4*6(v5fBKjKExUfnku*wks{Be>Ta(TV6+u&}6 zumP!HS6IhlehR;L{ve%cKK!z~=)_KZ}0|qr$aI_YJGxN_@+{ zDDr4n2aRs(?ljnU3XH93`oBn4LxfkH@ZAy~M#2w9cxwos0>Swv__G8Tk>GU^92tUd zK=f{jJ{ZwcAo8LY0#tTP2%9`fC83hCNN*=xVr3~|KA^HQI(O~xVSlzspGI;cm_esfpQ&Mk$ zq55xlGAO<5WMdE*hrj<>9$-_6f^=>z(^kLDp8_f7#&;3ihqp^Yh)H5~Lv{CM_L;(a?>f@zJeLx}Wpnm6k4fS2TYuVFZ1 z8Tt4o`A`tJ>c@TU{e@2abK~yHu@BK`sGGjEO5Yd!bOhil-kmHbkcTr9HTCwu*#=NO8hdoZK)s|}ZZ_)N2_q8h## zm6w|*wc;-Bl4y+L3KCm)SOGnTo0kICy@11t$U{EF634f--j)0ufh zgWKA-W;2+aLCU2ed&^N*kkuck!8YLoo}5`o_$*K2Y`IkOe3sQktO}8XxJ?^u{HeXo z*X$0sXIFK1ss@4L;1qi~CO+UtcZ8!1dRm;r1_Yz*`yo?+(^eI`+i$jVU;~8+{tNcn$P23bB2BvEH=`Z~dFI zVs0N8KC%_zN!P+%kA_@96VwVj)GS2m%SIhdU%d=O9L)t&OjI$Cv1iWd#s*Be=jKIy z5>Mu|tSW~UvjbAGIw+VQ>k6!e-x)0&V#FP?$6<-&AzUPV!r|v)6sF>}hCEEh=*IBu zVt|MU;^&x1FFjxkSa)`FJ$$PU^kP{48VuA{ywf-UZci}mp^D4R_ z6P_VXobV8w!Q_t}#|sI1&Pj<1Nc_iaD_ppkzclI=@SxIN^7qA0u1 ztV{U*pI4ml-4Y%~!VgAxYY3kL!TBcmvji8B;B^rk8G>&>^lphh7|~N8^0P#)i^v-g 
zcQ9fjRvA#^0NbbbuTD0h+Jfjj?cdR(D~} z4b1c(XKUE*Pz2;@wI`T7kcMS^kLsK~l0R9|VaqH7!U}E))85j-e)>*p%U%jlusq@F zn&3s8eYSozLAI^Cki8_v~-M%eKx-TC+K0C^5i$D2XN+HiN z0VinaKJ9wvhgKiHq}h8b9QPTmKm6>e55CENGak*iLG9iQ5s4&zbp8>Q(s0)RG$1jC}A+UKLAfgdYrhT*_Pexp?J`J z#_>f)tP0QwH9x#*p9xt-DCPIBCISo3lue35MNrGK(65#?3{C`3U6$M_k2ccmmUABa z0gIE~>Tr z>FZTch5O7-*{E#j(=MCUGhK+i4zeiN@<*Z3sZ#OpE1ux=_+78l8?NBNR~}$u z@i4o7$Qp|~Ijo)Dw8F3Nzxr>kCLBgnKmX?yH$SER&v!c>YWdH@xVq8x&kuI9h5s(e zTZ6AmT($osQ-enQw6{6mkUv%EJBdGw7pJMc4vsdV;}weAye>GfO1788k%3h^v-+BM zH6p*hf4y56aBXJ04+gtSSU=BamZ1|O|Bt8h4(s`Q|9GLSmZVaottIVw>NIKZo%Y^) zXz#tX(~x!~iK3&EG>n8KBuO%g(kG;a-#gd$_50U#x%wmS_j%v<^Z9rj-p1 zth}~vjA?J=p8G}~i1NR*_Go^VMiRe~bO%O2t#d&rfJ-qJ1}_O_@6ItEJq zy{;p$o_4U?@pU|;T==4sn4XL$KV54)ubYC}3|(KX^MoK{R*tpJnMfod@;j)dGZKq9 zl;&O$vcW8F^;g%<+GBe&-JLuX5zu)z8TIN$2>NdH%$&c?5q(}!T>TtcfSi}r7bo7< z;yq#VVLyUMeoLA2bDPz{SUYpj;u+}boX?hPNOz!&49Axt#qd_^ot?UCcDVQS`#UA);?NP#mE?zYXJPJN@z=H0 zS=fFsW3Titl2`aP<>~vPrLey`rNwBo9&6kG_5Vaxi$>1mmP^hxK~LZP@~-U3pyuq% z$3q)_=-Z@=0=n%FMejfMPl%1fRN2iU#Xt2)x#WQ3T~n37YQkfoGd%~L4`0mvDUc8M z6-9Ozd}zd@+TVt*T+D?pg_&IW-ee)+j=xWZP9}lcXVj0M+4G^r2h;{fvb}Kacb*=b zM>&`;+DLiADh+=4G#F%Gt&O90EU*=Bv}5!Ct{>vazJ{Ahf3LFbX~eY3`-Ob$np`PbQS|b<(_0ej0lquOAg`}gi_>GRU8b=rnA-7`Yvf7`lrcMg_lHnC zex!BsWv<0pM8@`g{O)=0 z(!o?F$GLh^ zA4wBr4hwth4Zna`^^^*SGJs!B>*~#+R6m#GyQ)pnX>x52I zhqgJ9@*fh>?njD?cp(zGz*T-N8IOG`S(S1BrT9He;p||S6z(u>V7F9AAM>ri=;UfaTSf+QbhjCp{+>hLPygTZJ5oI|0~H=|)vtcm%Y#nOw3bHxC6`?@!sCorYHh?zE1k zOJS;VH?PNTWvKn`)9&x7g}AUV=*EY}NN{&aV_JyDA9W00Kcg9Mhxx>(k7RNA!*LJU zJs`ve@0q_(mK|0IBy>HhD#ugNe{!Ig46Nn zZ@1=yFCI~V?a03V?9Ee%S@j#oP7`Zv85CSN;u;FG7&v_yG(51_uK1U&gPJhC?7Z?# zfg9MJ*QI7j{{~*TX1-a%(1_XfQoggGbNKRx@6AK&9ax_3_w3cia1_(uV}9QH96qSW zSub(E1BXYbCNit0B6`kYS?Qd5-h?g5lMRd5B| zX`asd?@mG%A&-_x{8>CT&X-E!B4Xi6R=e(Naj0{0q?*K$!R=pbxA_KG%}Ua_ThJ5L z+y>iyF!;`wx}I-M9hn5^JlJWgiRjp^@OG}NZSRM??Yu$h`@X*hq&wK01NZgF2`z9h zZa2OnZ-B(4zy5pAw)LO?H)lgFJ9BO?_`zEF^B&oz%E%&<<*(GTD@y(Gvsb>E15aD3 zzF4cx#J-Pcf@l>A;F?(V(|TnSFt(-Z@U5R0nao7h76E0n%4|!eX7IPSx0pKS);AS= 
zl|JV56IFY7G1AZZbw3^K{p#OFLHc_}|LrT-mv;hO;fVj1q~eV(eHPPRy`_$&0)*63 zYtQ26E2{B*Ir%WvFXA_cLT+y;TR2BVr~#UtW88CPg8`h8Y9AH?-Ul z4?O)R2AE3sQG`+?;{g*Dw+EfMn6^Xoh3(t}DAmY+ERG@vsZ?I4`axv?H`-LxPdwE?c>ip$;IQW+Ks# z*&DTvX6T`85Z_O8yQGwXeilZB5P)igtSJvfREADvpL5c z5#^)K%t+u2bGbBCOf7WbT;~+aiDoYB-BQh+*kFTVu7p+J4c7ru`qNf!8ZuztmviFc zH#H#tI&TDDDibypH9n_U%LhAZZggKie;hY+zoJ;~>w{w@w`V?j^+88@@^Pyh?XZ(8 zSZhe?JnZNydtRN_39rQV8eW}r1)NlF9v!AFSfly)vF0zWu^TA4igLmS9)svM=mskt&e}&3BY7J}vGj00kYlV@BW$cqsOI!e46JWi> z5!`}Z_0)79>omfo?k*+D?lh3|llSr2@KlulI?KiAStHyep%Oja^YANU1In@3c=+SH zh5V5&FX?O(dRxz_}Zb8Bi^_L>7Uw0_uS zA{va*`}iP2|$^4WRk=;zzMHM^c2Sf!RFu8e|<1AN2Px~ zg+0QuhygJ5Y@-M@C9 zK1v#{smPZmd7t^ED|UQ154rnahzjCBtia|+`EJD%bO+N}$_picgI{wM4RYPlF_Xu2 zgW?iUktaKmMJOCn7GLNU5s<`}R#o4}U1mULzGLD!55YGc70OstniKcXC zG;&G!ooW7Ys8M&hNUw(+-}^&(?cv)c&r3-T%Pr+ddsAwH{ZijLs8Y6iXy1#! z{dD6HaI~Q4xSC*wW?BB$bjR`oo#Q*NSginFsd=vaxf{RG-x(JR8!8ag!w?rR| z=qV8SSt8d(Py67>{+ByfxDwIlr7x{h zGH%Fmceh~VG-+=z*3Gxo6pHhbrMvFMoQ2%V!2-KJ8p4Uj=I81YF@VKGIC3~k7gRr6 zC}uD+MI~3+_MGYp0}e;&k^@BZuxIuSpJAss`1KL(Yswri_;snBHAN>5L<@#Gy0Fp%~8qphdJbAI4V5L z90|~G^8PBiU?9ITJh`{i0OjsEzP=I=0{#dcamrQjLVD}JB@Zy10YERhw=6#hL}~7+ z_0vrRW78Kb&r>7;3-e65`FvsE9%V@LwB8-QJi%u2=AAk!Ir!I?Z&3>H9=np3A{q(r zd~Mfzr=kT4(qmQM|CI$sCr)tCJCeQ!DC)_co|KW+&)WU)=T6S@%S>)CdRT#+tyv5-x_*x2du$G3emwm0!`cP) zEqVOPpjH8uS#ov_I_9WI`W=G~nHQ3e7~<5EwLu@N?==iOi3WNn2lD-tY|)z|^h_YH z9cRip*In_5N2NXLne{4O;9Q0PQx3j~&oJ%h>Gh98S^Q2Lj6Bg$)D<|3KF?=`iZ7HDjOGwB(+yvsfCscgIY5=>=5stEwBt$m)E1QbsClxi4-uB(%OLTa8 z&h`>m+VA#YvO6DqtYh2u)?hr$-Tf}C1r$i`oZNdX0i728$DhTg$8+R%&gOvy{*Csy zsWQ};7P8He!OpZi$LqKXk%_&e-gfVHm(Z1eeXtQqrRc_XI}nVU(;4mu0-3Gn+#3&L z5bfKge|ZD1i^2cyU{4Mk@Cb7F1BV2$RZd1fB*j?}#1NE(V&lgD%~=|wZoOl>uH!j> zX?$%W9Qk%q*J@qQL7nZiMtPiZ;9T33!waJmSlEO`Xm2QhwkKBKuDr^^Cv`(Rj1>LR zp(j#H7oU3~>tLlfx_i`s&yAL?_Q!hYQdID`vYQpQo;N=tgtSTg*EE*m5K zte;1+ai@#H6~+b^klN;36jmMz7+)Tb)|81ug6Uh|jaGfo!XNz?G#oBS=5Yhlq@^R+ zCslW6W4#R=an z;bA2FV1&1Z@F@_SZ-PHda1jY!7r~Js_y$Dpmgs{KJq02^OXRwUya90sBlfez-hg<| 
z5_9&$g`y|*)Au1;#h$Y7lGlJOL;s>kP9Z7weV%Hx*bQ*b_dTTBZvzw3$J{1mM6hA< zA7F5eKjZT` zY+s>~c=T8#n3W7trZUJyq>NbWSau7ZtbBfUq$V3M#qe`hw#6YW>NhC?o7b^#VdBiD zjUg&6+^x@IRsw}pXn$O6O~;-wiBH~|hJc7;c8b*Ixk$ITUOA@66P%J-cz2Mm6H>$~ z7(P*M#9sonV_&#NBDr^7&KGbUTDd1b*ngk|j6M=n)eo%2ZY%Z0rp49pYl#)VTbvVK z+Tf-+^*0`IX+`}QpQIw~1;bj?FBgKk{;n6?5)Np&s-DeJlf->0G2*`V&*E!Rrj zAbCAw9Cb~|i%@Wnlia68FECV}UYV3t0_`9AJk8se1U`Op&i8C4<*)C368l?Q3)w!^ zX{Y}#1}T*-UaFl1;QSFr0jK3678z%A-02N7frfZ#fOYds<&J!q!Ki<2lhUB^Srw&ZO3nG}TwI<4-`QkJ2QR zI;%U#Q)uiMMj(tNO`^TZEp>1 zd#FD_@+m-vFAMiXR|gPBrt>y`7MiPl+U6p{&tksLJy*L?Le&lx5=REAnJ8MC$Jqk* z9YM8S4>OTzv+}<_n9E{y!FEqUBDh9mJ3kxl^GzV!HU({H7cl*J)QnW52mal`Ug=eB zZ0~1vJRWpx?+uWR?dZH=8cNa)D0@FwhZNz&pC)(#dhaJY{3zZZ&P|@+qZV_8t2~m- z_ep*My%mwUI`2Fb0pg1ye^w&yfs?a`xN_i(l=`5ByaV>sTCZBjb;pI%^Z8;MZjfxE zo4nk9OI|MG_rvQ4bwKyE*2}v_a!^?iZK$w)1Dx~AY^J_ugC#x>-8ESbz!^p7V#7GY zA%&{Oay`dc^XH@?m#Kd6m0oxem*k-lHJ=c_L|?$W}(0dF;bH)`YQor(4bw1aW=4feDej}&aO zGUt1THV22$C@WuO&LH{7-fs?`xe6OgG%Xk7i~rv%PWWyK4vs8(!X(E~T1@ z9mOw&?BtKc8$M*(e_XR+tMhXE^w)gQmH|bVbNs-Nmd_!BsYo!vBr{AAo(T^2tS!@* zOO83kS03P$F9U{*(bv(OUD5rUB6pzBG0eryd>%YF$Ea}TOEO}h6tJPU zw!&)-ZTzPM>u_ILa^WA23dnbA%cb_iHITD0&%00jB64B3dACC+4YLhQCSTci7yGZ@ zd+&S*!R{0GlXscwP)c}Nt4(bccb z&vUiEx*E@QImHL!PRu>7`lVm(1~z;8S#97zC64hlnU7+xh0+}3`ibhLn6h^Ts9taa ztLKo~Q}T<@^mT$q&rC5+qD)Scnu`bJu5vZsJukwe>o?b~8y7(fhlO661McYXzL8QN zvPLZQ)Q+ddhLoqOIWXGo>5T@47eIS|6m(K994@e~Cegilimx@9!AneS3Tt;0fn3#X zD<``M;81v{eaoy7o??}r_oj-6wVT}skBOAPHtqlXxzyuut-8b? zEqJQ0(28vIEQHvw{rH(wIA@cCY|LXw9=|5D5Pw(v0a1vhkKDno)n6t>*)AeseOEo1 znc?2ThpF4XTWk^fN&DjOOGt&LtCi#jgN}Q{x4kuZs;^`%jky(lcwBLb#QDZzO-$ST zS-daw*QVg<3uxi}<859SR_xQCR0lF>gZ7~}V_pQBqO>sE?%iH79sAb@quLwtucuIy z`AFp6SOj8z^m>~-ne;`9bNNPN~QYl-9smNyAu z-k!5xzfwKlf4Ci&nOvMojy{Lm&9)5xyX6cNW8?134+a568|DHl9uXwjGO=5iMITF9 z@&l-nirJ{5^0>;3;4watY@J01{DWNd=EWnYk&d73#im>X;M^QhcSywz?ayE2=lm9k zuk_5>=Z{~+X7nmj2lz-o*X~ZUc0??`)3jD(`Q8K_*VH@K;+_Gv_R$yVzm6vLS*K5` ze@sLxyOLgYIvHU;PEeKU zbT7rI6C5wU6Jj~3gl}|zHk)D&!`*`pfPLQqTSQP+L_cX7nNz<6`S! 
z_oa(eyHpbq;T0!*w}gk0@PiTF8p5YQaJ~utEWt%2cwGcXhTt0zy<4IWM)VYj{49~{ zBJu{r9gNt|5_<#UJxk2lOo3CM_MLc0`ab)~##X-!SGVRTO~-qpY<%REqkEjC%laaW7!7T>=CjI$|O#7x%PsBa;hkH3a87MMW z9vU{vL=`*E)T#5Q!Hu*2A_o>OV&)5w_kBts21DJ$IhmP|Qee4Rzuy$>_<20-V^ui# zl>9tc?*i%FW^byX;GqqKn{Pe+9qa??R-U^)X;*^H(-kr$-(4_Y;y2!qsVKN3@R3IU z)d1X?Z5`Rg*p53toVjv2wF|S%N76pLbQPb!$Ebhva1Cx?vOcsksRADUAjX2SlaS5C zO%|253LMj5KVZdL2Pe_6mDc$(l;%d)6V20#$A_)9j8+4%043XHdg%*DaJ;Cs?P4>| zsMtL}(w_{9*J>k5HiA$AC%Youav1J;G#%3sPzGb+QS$xM>3GF@<|WV1^WfRXFJVgu z^U&8v4;N=|UBFg}vPIuaFT%;v|M;EX`#|6M!#i&eq@y2`e#$&^34pWt=$%AMWh8dg zqtHkr0FL+N=`P3@LYaX>iDO7h5Z#t(5b zHn-rMrRef9fplE;`MGXLQ3T>NV16OV(T-hE@(U_bj>~=Z=m-;^E&8~2{ZPA7Eewp) zn)%C@jYn|SNXK#+?mI(PaBD3BwfIFS7`!$^yKJsh(#mnn{$l6=YxhkAVV!m#6tBU_kE zB!0?1!hFw}1M=CMltpf_LlI9G1NS6hOlC|yvJ#_-e?GtQ#!K4*f4-~Gm13cdk1N6* zj6Z_mvfGxM(2=vSgC`qeXoHsB4sE*mJL9voTSrM=ai|w)8zzJt5f4q(b&`h>bL)0& z`@t~d_|J_UONnS$NpaO-C|mspn?P@)p( z-f6-M*ae&aZu1RnZQDb(d$&A_$N%-gIAse{GM4;+#fv|p)yJiQhH22hT-TlPx_@~C zwjT`t?qI&)q)EtYdoZ4=Wi7#!g{Y$R|Gj6OMK1iCvn%SM{`96b&@ZSkT`Sfe&<}Gn z?yxsN78|$iTC_~jK*z7vIct9Sqhs*H+*bqKolB=x`Y{X&YI>)_dKH**k0vwO#{(3L z({#G;=R`{HyFcp4`yhc|fetoU13q~CV65p_B7Wv~xVpxz1YW7)s3>}C4n1FK>zSsS zqNO1D`x-TRh|bhL{m%mx^s=`gMwip_a#a24ezoXZ@jOW$Ou@#XLCS z#+|_$Rt=@7ebgRt)WZ8L3xt4b#;Y07`X8tXV_We1-|NWQ|_Gz#1pOAEpbU<@W{0KN!g)9DDuj# zQu;$8ZaF2kB0v`f0uC;<$1b@86CKvG#Xf!@hpF_W{7O2Gnz*T`ayJi}uspbu&QJ)2 zDRKE`L=|2=Mj=SY<_2h9_=>OCkb1K{pJHsLw2}Xv#s>_&i3oeHWXhPD!}DvQkYO+e zK0R6v?NUQUYi{&r%YU;&@UQ*FhVL924Zv`G&?pUJ_jjuk&BkiGXdDREHNeP2dZ4KS&$v64rpp zZarOGoT-H?Ecwg_Rl*Qs%trciIYWO9_@szw+&k zbTCp;kQyyH?T>^a(yy?-Nq~3H?JQ<#slvvi>x-_1Ww1e8H_a|49fhmdpT5AskGO4% z3)hnC;Kx$S!7RCCT*cmfD&T4?RQdAV|IGz{bUb@+TBZ9Lbaj3>@J6Q*^7LZOliBpe z;h$fsl{%HcnNwZvkLYr7L8^oFRm~E3E*tOOt7eE&rM+)YYZ)TmauGAeavq?WwiLo; z>;Q)Decv;j7Yv+jM}u#d#R4;h&qwJ;lfjSd!|c414v1~(EAE~?f^RgF$Lnf%BdJ^& ze{UDk9ZP-apI5w+^~me18}=yZ$tPJ+?1)mHSkrI&!TO#Jg*{Q>1uV~2{ycJZL{l_m z|2W@Qb!Y^B94!Ff{fr-zxQL2%m-{$Lye`E#C+FWh(h+H{VW8D 
z1NRsTqJY&U+n@|>5)UWhM64U1HdriBea#(63+-6_Y4qQ%_UioVuc9s3P<;AoUxsiK zKZrFRSnJVXPL>!IY)#jt zn_$xebTQ=`Tl=+u<4pJ!Nosj;E<5X2BP#FNmvOk$b#derw`)K zEY=YhAMS!_?=nYDxRP?fBe_?|*hE1hS>fsAU4mdw3y*A(5Hk=p=SE%)T*y7^kt}<0 z6lzG=OFeqT5jC)>CL)vHm-z!e|ZV3+~;RhqUHH1%r;CvJOS%Qm5 z@VW?&48b=bdbdO$jOZy4`B@^@MdS^LI~cK_CH4lydzP59wU=s7G)fFX4K;F>ADogn z)8p*E2YQyUp;&6zr^^O-`YHt7^CsPKAH4prY+oYUd2{&3OszUzr_3k1!4OBRyp4@l3~p^f_4Cl5hBt(g&S&pgE<%Y6n7Tj(mySABYv@ zc^ZwaqTtArr&rX!m_gnzb?)yk7~v3AXSD@cb@1vLReAb}5D;}=`EG4-0Fa5x3?!S= zM{HEn@9YJV!Qc@e<>++-uqE)z{@Pz>pf$s-qhb*R#(dV|Su|ro=DR26wGKKcn40H+ z%KWEZ9C5QnO_UZjW%rv)_mFx85l{2~F7ALo%wR<515qGa&i!ocZV+NvP2uG|FA0|@ zC%;#AdV7wg1HzopUMdVRQSb{fo)b59ppapXr?6!^iOTP#UIa+ z{4CF3`)%Zckm0m`-p~wiYuc&%l*L(K?7Bn4S-%46xxlqCS!JNotvY|qE(1JXGdK0R zTSf9A9ldE<;DofoC(pirRRH=cCa%7jPlkRT=DTa2#Gt%FFYEY@3-JBNo;y5g=kP9r z-Uk{s0jRNh;q8i#J?axMOjs%o1MQTrSlE5iVS93gxj0u2ju?H#&6=Bx-aBMv!l*dZ zaG7_ipE(yyvap0dS1yGbS?s2a@I1b}1W?C$Oj28#(IS;`2=QMjKgUwQC{j;3oqaqcBGlh)-f;y?QDS_=QaJ zj;3dVsvXIf>%5#ml2=tY6OSh{wd1pSJ4g>NE%46y1}CDvEbgf6nqZ_vUcNW?%|ZAq z)m4C&{9QG9}Ir` zIwbKzEEX*sWZQi0>5f^i8`Cz*M|?BLMSQG>cfXc z`L;!i~D|WMrjA5q!eJP zdzu9(p5aw!q|gIiS87hI{~7I_{m6MVS#zbgJ+R$=CE5#LGks8yXH={>@h5&Z$THd_8^pYl0Klbq9IyGlZc*P0dE#YA#{9uH)hVUs6oNt0ZOK=ehUKhcUA@~MF z@0RF;5j_PWKTG7gh`a%D2P5{g#NL2-&k}R?z@2bkXORJDQN_=b@t+Z#z0Tiazbgp+ zY~E)Qv8IHI8P4Zk=k!2i=a&m5NtAn|=6$~JcrEeh!bKjjUtQ4okjkW1m?~r=`%IqL z;g60izV&^m><)GwmFT~C#sNtxDY*qCl90JQ*FCS7+CeEfqbe>^ZdEC7=6YL*J6tZF z{&G!L7Jjs_TCX@61BP5``t0JYz=loevG;3YNZ75F-s+?m@C&$Bo2uvnMl4R7NK%A> zCy$uZnXbhHNyb6p_oN=Y*h}&u=_8&r=hpYCV_`b>0vn@ zS-;lEjdvZ`IYhP79<5M@o(V};MZ)AX*8DqN!Jj`=^XUhapv~i47QqP%T$?MT5|c>c z6%UPW>Cu~#>b^!vZO)P424$7jouGr*^ZZEBfBh0r?wkH=Fb%MOeYmWW^yrsftMtn!G{k)H;=f1<=`qs@BJDiAMTNBru6}+mecyz(0w;> z>PEkXbhsP1{x>ysD_jTBND2&x{)|9zEwd3hquFR^_m@jf;yFk?;FkZ_Wgk>T^Q_uj zKL;&0uTC3{#KEi`t)Kfy`wIhOBilN^LS*JxF_;^b4M&{oD~wr5eG`qo_P(EmNQ8Fu zpYQfw?!obTheDuTiPdzF%vA-o}m@66YJrEGKRAXQ4rO`0=SE zGhi)F?#`=giL8nw(s#-VpI4M5CuF@;rnA#Z4MG_;3)+ZXAV- 
zvaKh`xC$YC7SEO!M>yDW5?*x5_d@PFPHNvGHw6aR;HLbRng%;8^xDe zUwZRS{FS=6LJhy(Nb{ysNQ5-zLrQ(OeN&nqULeWn*t6%>*Bv7Uo@i?3;&(27OV>W4b6-E2Z8@o&vS-)3T2&Pz2) z?f!u9iW9zD!ox`T!3b{+;ZqI39+b8bhZ-8QhreoT!1kKy;~~Et@RO0l6_>l+z=Wvaos;+6P>cLi zT}IhJ5Os%ES@^Lm7+EO0`2cat zD7DZCUv&FecirZh0KB8?ryj)vZ`gmpU~o9Z73`wxA6)MY1G{e|8$N!p7`kzn6#=4IjR4<@>a(ltmMllnbS+6n4BKjscv_a5Ehqc{l)<(9r9T? zVP6iOIZ#SuF*SjNJ>-<`Ic)0pmm@uzT{llt~tF^*p*ilhYTQtdi`emFdv&<5wGZ zwNNzFl{P52T8REEmx7_d0^qvp#x~~W4AuVTJ-_~}2wINHyxM)u6t4sxdt6{i@(HVQ zIi2G7NAYjPx{gjHBZo1Ds;@tyK~08UubF@moH=utL+*Kt4J$3UE=Mlzbf5xQm{I=o-R_84ao_eZc8njZ zQq(L0QNH*7d26ngd{A}qQv#RHQ|-|XvIaKeg4_IA%+g-#L*gRh9mneyNW3mM>%>yG z7-0ksQ)aKpK8*!UD{kApTg;MI!9wbT;ja#dm`Oba#p>SrGiyGM;7}Lc;cHzvAc0C@ zJ8ysmz~Ele9Sr`?pl>AYXYr4P7s{i~n&7hACtIp;KHy*M*|xU3&a!MGAwa@rC}dm%Uzh>(%e-oB^|RY1c2?1C;W}*<*QA2{asFZ4N4R z2KGDdhhKh}hMNQ51)tv94f$-H^TGmYu$5@0_~@(6Ub%}(VxF#zir*ROr{^#5LVB@x zC*vZy;T3;R{*+-SJnHlGeMVI@X0V^moL4NslunCo8Xrn=Qjg1%Xq^%aD5<{oG(^CU z$F-=+Sn9!yOY2>3fkaG0CMqeEodh+=+W#({XaeC0pClnJ+EabAh33pb%sApK_$IPvhA|K1l@K1*_H&+g4sG+zh?7NQ)ZN$_ZalPk< z;><%|Jbk7F&{es;fG-l&j<)^vtMtQnGRZdleeAFfg^*U(B^4Z;5;4qQ;RQ!bdW)d3 zA?{JZ3-31su$IGR^xD)Ji!v{IxtpKGQij5v?=oUA?|!KVjCA5~+4tQWpZ%^to>pzb zmy=|Y`gNh`?->_(*Qtdu=3ERYJDKt|0Q+M)-&98H;dodhN`?|AGQhy5R$QNV0{*(I zWsg0*Ct|mek0Eoi0xk1{TE3r8;ryE&6r_k7e9_a+%}&9C$45B9`NyNR@I$C&iJ!VQfu+FAndqibY{e~7@v9D2c>ou<08i=2Uhe7urgK2 zG!<1AcD`@2@%?r&4$QwGI2d9ACMKGto+eAdk2)$8_i4{z>lk;93Hvyp@?0^rj>iGZ zz2&%~lbVG~ZumGopveQHKg6ylhX=qNzVi>^9#?S2>=j4HXIr#)*w8V>z5vc=A05eU z_JN1FWO@+Ei`T{y(Y5j{6PPQE_XT@~p&jc5UGnugaK}T|>)K(Vusn@{g>};hKM3Qw zJk6O1depj{qo~43JS+v@UAN7U{k*xrGmk{fp0H<(inPDgsJ&0c7!U*Zv0S>%&F%|t zF)(_O-x7fGdK{GhMGL@PS{E*^Q!ro^?`wNo$(1nI%ofw)WqrK&_8z;U*GgEPoH=sR z(HKVV_^bcHFACP6_}dh+ra)MXe9)f!bMG+Y>B6r^GvJ0s^2H;KhG4ezT7x!_hhIDG z|FF`g!a!E0fltp&(d6zALfc+(u2=W|`EJ*rrXOzL2}WmqbpH9lR zzwsLZuhS#UB+fThEi%~V&q9WmE8AQ|Xxq!-o8zno_|7fqc70a_S|0*pNPGj_%UZkL zyM_A?%WU_-Anj*srOTvz^^BybK6kA@GHbKg&UL}i4&Lp&0j%J9x_t+OGNnWLl09csU74>3>0(Q)P@~A311W!rXl^qS0 
zK*QO_<@1;Wls=apUnti@ParFK9O`4tn4O--15&j|?!F(yVs)xHD9eReXGZzyXdg?)1mS{?PF=&*Ya- zA2@)c<=1uWp^mAXi4Tu9zSSkG&9g%o#E7U2vAr~coj>fm9!s3TDLkfjhf`z$qfov} zsjxQ;Hq2y+n~}l`>ArblIx^_nPmxbwNE|aUzLD`guf$Le)7<2ax4fV*I}z`wRE5>& z1%1oQjqr4n0}p+g9Av&Wm#xpMf&8=;1qZG&q3HKteta$sfF^w0Q$e17n6v-rA8*Zf zy+-XaKMbx&fL+f|{9&n=M)xcl9z}|OmQRiOImIAe)rkAlYrUF;L-sEZww1SE19MIM*4Sm1Iaa+hB z0q-}TO{cpPjvh%J{HT4=3DkV7OJSRjfdOnja+1auu&vF<@xcrJm^Rm0VlXNPsXddZ z*LxfQ3U{!0es^)hd3^7Bb}kj*$ydyJW4prfsm_=ws=bLwIBDUrQk@@|N~>ZnR5t@> z7tcCVca}giX5oq9H))VQ!c+cd|2f<#5vQV{n}PA^(Zrx;awsJ_->?)(4inPW4ZLDE zdPP<`#XNJ5;V4nN*t^yuSf}Jy_}pDScqH8QK-~d5IIYm(f_OH2S)wO>4R*%@p4)4( zHS4N)OYTv8eGmf&d1m+B)HKK=4bW}3i+n~9p_Q^3bxKNTf$z9E}p zfh37Pi$6N@c+sYsqW~Fkvm`kZueAK!VG>6MvdcxRllTU`SvC>dy<7Zrm`dL`-V}w_ za=uzPn*e5#WvWT}Sy<3Evz_b0Ba~Fzc>~{v1du0YN&YU<<9gd zfU=)e%-o(A2Y!uXgIxKK6@{$#-&0n2-y3?$LxDQN3rig|2paZW?NxJnv7?3VaqrjG z+*eBmvKZaAxtY}?gO7|245d9cgmkKh%@@plVfzX7tk+5;F4Ae5#Rh>?Xi-Tnz(Mja znmr$v?jhw@)P1Y=7PRJpnUYrua`gsydXROeBSUWu`3?U0{xT$OJmUfax!&9)6Syu;IDEuWM~W_)g?$tr#*h7}#<=NSyn-V&Q?E z7r&AAf}!cDF7qBDxVS)oJ?cMNIKp~mUyZsW5N~gf);Xb%f;UN=!J6`p_NO&Sj@t&fg%Sexk;u6a1t z;_OZ0CJq+0+_VDEe|0Ip{%VIaewKfJ{1?Cw7IsHZA9B2J_RpK5N(-H*j2JK3!Y|9+iW(7g|ZjsX6@ap;S}Y?Cu45fpk#qE#%jV7zOXpl$W3()?%Jz9 z^PKc~D93x1W!g3#{*wEgZM>Vr&GnjM&K8RYAW8VUw6ZHKT)cTjLd_1anaCcf{$-Bb z*FRpOxO@%=FI9G)>~VrD2hFtPi;AI2uh~AEUy(p%XNU4nyCB4Saj}uJE($MOnti-l zZ3cIx)3B%i4Tc~3mrnB@2?4%x4Aw@*eu#PR<^HMxOB5KsLo3uO7Ty&+$M4pki{;)p z{M0D-gnYcwT*h)4(EGFglffA>68x^O%l#S|X6~J0v^lz}IPx<+l!>1eYHi)L+K3e- z%~x?sYb_q^$P;~tQP&acGaKNsJ^#nkd53fT{r{iPl%z;ziIVJuI2=19WMuD=oxS(o zdxs?1WxR!Qa#To?BqV82S(T=Y(C?|=_xbs&tLy6OkH+KmcsXFT!0%qCEvYDTS8ZGXdxh{MrcS+&<)Z`+bGo-_HqYf@Pg?tD4cKY3OXU&s`%_@bi&8xBx4nVN}%?7*tt{)ofC{`BrV z<8S+s@R0~hTMwe|bX(GbegA9+>l^Rmha!2Pob=4aZ`I6ry#xHPh>?TL9s>sV{!VxJ zgq$7LmlcO8`wk79$UX%rD(}+BmFnZIhU+eVbsjk5N7(S`z!-ebT}?W!HVLOpW+t~N z#^a$k@4LPy2f|PlUGLf6Yv|C%kAOZ78$3Fb_>5LR5#F4cZd5g`L>eEtz#FAYxGPz+ zsP9A~%)Z&~_wiF1YMM}ZEhXWK`%4%vg26CIdc{fKE$Lw-{a~cGhV&_roNtmpOL7rO 
zUKh!cA^8Sm@0RR?kv#=6KTGDi$h-mh4o2Q*$-4o$&ysVt)UdW@Lazr_pF3@<$}R_( zxpVZIce?;XGiG%s?J!`^Tcr1uAjDka5HuSuu*OyY*>O@Falu<1XwK#SyaU~tjbj@q zgOHB7MvcW`OK^LIn_n>19epc2;BKqvigub^I(Umg7gK}B$UtW{QQ~XQFZ=sNu1v zUqy|WR6vC{Ye0@I4Wt!3`J9*Lmn?O9xVKTaDd7K)*?~4t7_z)GdG=?gB>cRl67c<| z9!USdn{Yja2ZnN-bDGyai>Ou9)?QIdg5kM?jbD}-pf>*pWt)tJ4ufyyIr*xq9kds| zgcMTRqou9Om$&dpg4^BF_kQvyz`I}G_O^sBbxg>!=>F_yBzWR2=1=PN!G*u`26|zd zNc&A}4Rz=V6ej(=!0N{l*gxf$dB&IlKaOI(^{R>jX5N?)WYT*nyKU*BcusT~=FZYB zu{lwUl{7OeizCx9UBO8r2^opY3_VMWw2SbxvNy-cRDaYzv1=t}EDP@8_%6yB9f!^6 z^`eZ^k_j&9J9lsYR4B#obNx}A9Uh!}srd6iHVPY{2;U35;idG(p6AyC(CpdN)z!nk zU{AzZja^@Yu#iNqj0{r_oHNS3(DEz?zkAEwW_>dcEfKqfnp7;9N~Iifkx0gK6A{Li z3wH4Att&SBT;uV*FY&L!2a}Po!%%n0wM3wEX<#dDga!Dvly%kMP#Dac-lC@&mVvnz zw50_8Cc<&okfvIue4Kdo$$f_@8T8Zb_J3nyvgj2jBT(JZu+-Oe7StU6y8Lv= z67&VvN{!aofysaw7xQQtgdVK1*vILiuJ^ru_GkI<%VP;iKaSX=m}WzlsKX|xG=%aB z@8$hCPbcT%nJ{NG6R>A_wC@axX7*?ilU9X8`7ae%pEx7_xotDUg(s185eL$$w1h=# zR}bZ^3xh*yBMgM^7RFZ$Js~`dva@E7chrfgAmDg$zi7S)lD8D9dMZPOm~-+i2+lX` zxm~r%pM`5D8tyO7SRkL*pLZ_CokS_qD;!_$9|iZXY}w=+;OVHM&E72(Z9@-(eJs%Y zZ6g7$trt)teM#}^U0YyQ8S^jKb@bBG{AcGs~e^yjTk^MdOKm@XUl^a1B z5&#<`zu7Bal?Tm|{)+^Q1DT)m7~W`%hGkvnE}g&Q4NloT7StfPZO?^%q)rAaUC^BQ-h~_9o}66!Z&j)?`|ddfDX+QQx-B{t)=CZ%Jt*Gv*cCk zc`1UwG2Z~GKN$BAm8%k4l1TpMf z+%L;7hK%hb7G&#`;Xv$Z9+^II@B;69i?jq$J!r2N64ZeYA5W%!`lbwQ|EiBW-9L;o zoo=&#;F5;(Fi6bxlnnalr2VMx#BXU@{dXGQX}H0MARFsXf$Opt6WM*Qml?xv+|Nz( znN?A?jKs71+GoN2?6sXLZ`42uP4`T5)_lisME%ct3vt}=tmgoi)L95ltf|3n8ZfN5 zZ@153O(4wj?AaehN%W?MesA7GJ`|;vHWJ>T2P|J79uIIOd~dVQ&8c*(z~YPbpIsld z5z}#p_nZyd;M2J~L;3Qxa7tqCl}(VrzI~MmMl>p-xU8VBf_tI zHD?#YeLdjBm_^ycoCvv{xr7S}M{GRnl#=)Nb)asXoeS+208c@oV2# ztl84&%+qrP7o`D0yL~G(m|sbI%Bfu+h*bbdhEjWHWrV;2wQv^?pA(`$La&`&B2c~N z)|R0?qTp8qVQDiH$I-XU+;Z3)5dANTE4}`DXrj-+WjB<@p_O8@=u#lsQSn0mF_8n% zuxq{dEnE}pyh9IkUL+yO{ImagR7W6Q4tsaO{l=KW`IwuBVk#<^8`<=VV+k{tP2VjP zq+6}r7|=vhPyee|pEm`aymT99sm;K{L8eWg0v<9F=^{Aau

av||<*5!>m8OACtX zfWvW{;i0IDAW%oFjNr)N)^paId;=`|CxiBRObD{I?pct1>yKJ?oi@tCC7ZZoi495>TmZ4S2SQ47xs+M*lI)6$LEF~~Jk;C(=EE~48j z=b_qFghuJI1(L(E(PYN^I(ENw)bH8$-bTy|Ck1b7pQ!NwC#t$T*q)}Ko5x!=n*2lY zYR<--`yU^WsUI-UeKHmeILjo>S%u)ptNUv{UGfBne1z_nwI-sNujVY!(i4XW4)*8k z67M*m#uy{7R>6E7kREf^L`^`@poN=PEOceG3?sa#UeAS8E z_{foH8!I#Lvg3?8kF*0cplm}9X`Z-7boVZ=9&SAN^JY1}%moPheu-sf6U zlSBSP-8sC0Y9Mm>RQK*1?y#*inm)xh9KQihhx4f%(XTBZzqpO4gNfp#A5Bp<-~%1M zl^eY=u05z*zOdv5rNn$YRT~{JUq#MqMh6A--0EZ?-xeWIB*d!5HfjJ|*xL8%3aO(L zC8FcRWd$_X*7;ChHb);iH*}X(qmiU)clNkJ6bimqH0QGFjA|Zz=@Kt3!N#6fbkgZP ziSPM`6f_C~(fBw1l@jW!n6m8&%9JE{n>p`zEF`j!^S*O3{wNJMJPD(I6Y#Zt$JI94 zZ8xuAWm_vXwu`a&G_Iai?>7S)ds=S>e!PSd2j09*SGNGk=YQGsbmS4ed!MTbI}7kH z+EL)pt-ME+xCTj{(9P`Mkt{L0!JefT!+ z`ZgpBU2HR>J+M@U`cl4Jt7$4lMY<1?T?ilgM*3hF)j|?lU8Z|r$=C+lOs#2T8?T~6 zh8lUIW|2tAeBbz86Abg@S_bSR%8_?`z`Y1dN3dWX#PqlRHXK;<7tGMhK(C5MGK?7F zfZH*zUC^%v&MpZhuj+&oKI7D|7c}y4%T&guR~&kVJ?t%z^5<{nXForx`Iqaub3ah}U4tcf zQ{phO?YB9|s679>x5EVx%Ro_qxB(ES+WT*x^|YkY-JG+D@f^~XLDykZconN_O$hKd zbeu2tBJ!8!7rxvT2mzrRFZWp9u_FATu6KB}36)dT1>tBCp$kEysDws&pJt^r`CR z8*{EmU?BEMnVAs?W)Kq}=?n)o*Qj@8>tzCm*q;<7>dAoKCeHGMLa0G6MIm3M!R2Y(;P?*1*A4YkubI9XbwK!s_Sjo=lP|9Qnp-!183toc7b z80oDceF`M!o8-@uTtt%BMRH_Fz5&_0CHr7xPl3$OlDRH2Z$Q3-k@s2hZb0s{4DI9 zx9;kK>$NABqBVKZo;eL&CnbASP%b?W^JS5IS>l9bV-V=8ju$t0=LIx!V?T~oM?nz} zJUc4nj@sPCL~d@x5x%ZmCfUZzFy(nzMr}Y08Vf5fw;*y(Ddt6i_n0c7=Ne_J32Oj) z>N(R`zCRwXN(-~@mkfezN1xgL?6*W>D=7kX@sZ$}MZV^nEf$EmkAC!&QU&g>bKS8t zSqMjYOpODJd;l732AxzzAp6_3x;DooT~{MwARf7rA%e*fWG-SkDAVT!9(>Qf-i;;h_*1v6wV=uN{S zZ;N6}M~%NG`GDz#d!LVYpTVsSixh!e=}?t-<*Pci4;mfkN;b8PKm~iOxQ}#u0GTE^ zH}{xeczmd8P|7P0F4xbwn(i@!!f|tg3+a)-`drbD8hI6D%+7zTA;TLDIdng^@32Qe zPWGu-bS<8_Y`mxRZY|!i_l1$^(tkM8&zb2L7s6{|GkN1BUDzmm-SAaPH(pt-620%& ziNo$-%_oVTuz|Zx{^hG%;I~h|qwBZZ*nG?J($r-SC`-l6TP1u4?Tvy;g4aH{O;DGi|-pzNggC$)0oA+52g&zz< z;woqdYrO4{*;zoh!;sO`Bn!}qsI(Q&T*KO3;XM359f|Lmp>a-*l5k7d_QSSLI#}ED z;|nAa1cK?sI@KsP|6VQ|NUOVozqDD4n#y6Ih_C#?LR2+!8TXfXZbkevFUBLbw#yn_;>Y-2t}C& 
ztT+8~+t;d_xZmBqlTA1lo;xBkp~W5zmh}ddpDIP;vD9Ndbsx&%@b(pSySoU~$QIV8 z^IgSyTVA_!Of^GI&R*pQ+7)2G{a&j%~9FF#!YO1hB6H`ZM91oA-rQ1pQQ$4d=nw(Y}Or?9jWkjT+5>c4dqAFwqBb zt_{w2#218WZZ)Un;|P!Z{lB0$(2#;-4OcZ_T6)Iux2&buz-!tn^GPQ7zK!5zDULwptrKaNnij zb;128Y+wJR3T{ty;H@5o|_ zLE3xo=Unl=#~SU{EkW2?OnAlQj|UEZy&ffyXbJTvh7yzvn&DiZc+R6~dGx->^`fPb z9i&Y3)>D7m4(Th3KJ9qq40gTo@7_>$f<;q-*+LAhu*9qL2TYx+xtUFl3`8vbn|uSvtSx3*?eBU4P@#6gMDD~<_-Al$xD7nY?-<);2 zVk4KCQUR6aY|6ea_tsMm=hm%VpP@XDhgc6veWOlu|1xl^ zQ{^P*bU7+YoAAGw>VQ`#O7l$ka`Cdf)o;D6)##71?Vqw);(e$p%C78ZICg((Ye9dh z456=jBBq5FI9d1PV@Cyl_%3`!;c8C~5-zyDsQOF=>($@Y$|ou$FF)V};-0O)btgCuxn68H#e6(r$4c0|BV9PwetGzs5?dgyF`JgRaWCfJdyWT!Zc@>VK`Y4XzRJi<$F7c9GA}&)pR6eSr3~sH^ z&jtTYg3In7A0Lm+!C5C>9DQ`z5X@{0i>SUSfxkaoad3^)fT}LaAKP<}pic)1146Zn zATO6#cvz0P51VI+tp;NzJ5u@R{w?ly6zkZIh0tdUk%uz!y3j9 z78*6so~tcQKZbK53sN5|{8mVC|DEqFQlG^vI#!org$R%2fpxp3GBwn&XQ5lDQxlo3 zZLGZUGRFRV@kgsxQ?aAe;RKFD@vvgHzhY(C2CuWM&7E{Ahs4IIG!Rt=$HqooW!5vH zPTqy}q401hm$+!c_}&&~v&K02ZV5w?l-cfQd8@Et>$qXAsvRz|`m1eGNc2}ZE#F06 ztHhEl4KwtIUGXRrrD@3hJXF@V7EcufVpNxyn7nZT531T9@|bkMbTv};2dTr5sp^d^-!d)y z-l1IKmXkLg8*~{hakK-r#f^b}gcroK*1GzzdpMShbdel4A~=x@9mkyN6QI+Ul>VM+ zd!)0bdnx!5@qLR*MX2)$*OuhCl6br(}qq%wPe z@PiTgnk!3$w+0_N`Ol|-d3wAmSnHio`pvlAr8Uuj#_;tf7ZHZ;NZ90cVYXqGO^yui z9Wu~6@<9qH41Rf7enJ~hfY@$6T5^2u3IrQI$8PSkRp++7-JG*6XFg^8EzZXi8=AQi4O&?F zN2o2E!#;$iHqgj{W57~KLy)~l2C1zatuMZ$2=c)%F&0-zY%f#t*ys5%ENL@wYpZiC zHk|@zO-HGS`-6qZi`&QX&rVLE#8GX0(at)fp+FmNiF&D6;bj1kV(X8dBtN)4BJyzl zv2-X^AJOX?EDXSepf+@_rh zq*`r#Rc=MXuhTh{PWq;3diYFYi5Nc+eZ!fVZ(@#VfA_@ymG!}-SDf_Sk{(9V4@P=x zNS^}9`6l_ZBo~q7b&(tyl5arvZpl6v*;63%vt+J|%o~vJVB~$4yc>}FEIDV@-4_0q zk95G@^hdn^`x}F+R(aKFlq^v6S3fqlB4xC&6msbMJi%Lt{@MK?Oc(s#R-h#|Wsg;# zcAurY779zOlqW=P0eI_<<6A}EaLiJwH2sRl8|LikeK|kvj!Qd}4qUMG$80YoLW<;F zaVLv%?XMJ5$ikev@8!H6EYTRuTH9@aSOi8tH1RrNKD&Fwi_1CazIIA_SJXxHX57K9 zGdU1D$vHf{SmcC*>*MdxR!X6TqG!84%=_Z4;q8i7&KrYVuGAZk%Lz_>v%ALg>o)i^ zjmq)2r35#8SaECJ#uu)HoXU>afq>rMGdISR(X7HF`FP@v+r#ja&obW$j%AyQ@-tk(mcnam 
zt|RB5BO`lvncaUd)9%&;YeEBz>RDK7GAV~SGD5?=qq(qeg6YDZ2j$SVVC2UqXK%2d zyVPniG#Q?_{Z+(wp$K-}TXbi4js&g$4bJa4nhkq+ddvFOBjEZW3)T#WNU;A{l?O9V z6f{zyrB%#v0NEeX^>%3MBc+!ewtXTwIFDahNBuv4nCt09?LAHSUAo673_@eUpvc5| zAp*AI^y^cHkYyY`&j0Ak$)_iwk^KRqjkrKaEPIc7n?k^C7XJYGVqY|`dZl&jmMtn` zJPgL<6QE~%_giz`82l#ZS^~Oj3C)vB-hb(hg=@ol9`ng;xTvn$@P!N;esh<1-RdkO z4D5gY>n-bP=vxmH6r0b&7AbZkR~{ZnoHbRF`@FEtRxe5cX?e^YVtYraA`Q^#+&@Rx zuK}scexHw{ISjY_r<%fWF9S$Co$mavQWk!WV^n|J;s&kd4u9m{5(}OywEG+;@~-w% z!{xg!MZ#;NHq4hg^g*oHt(Oyn>DWiDU~$};;K)ClS~=lj4ti(daq6Kxuw3N7g0L8F zC|RxaETNte`to%N(yx){JJ|#cZz)b{JIwS@N&3yQCksMSMM2BJor=k z$JwD=v+X-k-Bi;ycKtQkfcW_&nk;$P&}NxzqArXrw=7pBp0b6^mYtrv1GYlVs`!ph z2W=?D-e^0^r47G*I4JUX%oNM1NpL^548pS0*+-RrXX2Nq83bFHi*c2ofBtWWV%$V? zNMRQz@jZ!;XW#3YVl)*+t7sYPgK57ii$}L!hkARIu16-6qFZL--4-0t*r`3-JA$nO z+HcIfzUWwrEGg(3Y_`W@4d1fI;uJ+toF}$1AvOtls};YCdmD_a74@UMB$owps>mu_8SHdybvYn!)2x&cw<|7{c|hGJ!T`5F zvoWP@H3w|#a=&|r<)BjSuQ#su*dZ_+I>IUD1-e*jyAsD#qW4Q!`g+QPTX9?1qh z%1xMNNBauQ@@sdUgR(Zl-L&5%@${daSC;Zc;n9ZZ*;Bbk2)`CexWd$mV-;Nr`zrp! ze|Lyy-%+W;w?x=~AePHGLht^6QUzD=p}K$!+Vi%^n)BuZk@_oeMbjXu>TwFb9^6i! 
z@*oV+GX4Gd{r+WmjC1Z*ovS6@pfV18=op8F^-K0#IvxiPRI|BhhQa~kUD_>0>o zZVvqfsF%ikQKtPe{=@V$_fQk@-Uw%#bF&=LHTS&JN)G42<&~e3CuYMyb12s8O3wn( zV0D*Qd_J&lzWa=!ISXtzyTdz8mcK|9#z0TS@?gCyPoUpNs2mlXuNAOX* z{g74tvHSB52|wWaOX^hHtq34hTPxb-EstBYiloiCd;s4mAx(#E5g@tIRAAU&7cZJM zgmx);pxIn*-pS}U9fcFWLxVH=JGRy)+H$sw1M4k=@p(eLK&`q&?tQ^uY0AUnjYjvY zWZ#wFKe_1@w`WOfk83vsy}3NQ8IGz0&PNI1Jed%%aD92V>8*)PE&S(GkaIgHH#TXD znk2uoUl{TOvDJI|0qc2i+lYM|!RwM8F)u$(aAafyd%GKb;@8zIAD+{4l?2R5 z{MM|WE}&22CvMRQcp)hcq3ZUuaFl8|d}>)D6^-9wj{Q#jKPux*G*#W356->1!@=(F ziF^6KRtlC!BOG%>EsD5@M>?fTt;B`mCwdu5-Fq_8`5Tx&Ikf=vHHMs&+Zl)xGh`T` zNe+rT>FqMZpA0^n6&bQKcEN#v|1{>eW}zxe&(AvrG@(FW-RRm51>(0&ezbHn7;FT* zR`rcg2FC55-w0?20!=}8#yFz)M0&+Z-!183B>iBdw}$j7keqLlKTC2ENnRJpksY<xzDCNOc1C#e#9CRYDNOJ^obh zxL|Ca)(~{ZF9GhAbd&j15QFR17rqOodn3WZhOI|lx&ySNv#2KJ0bD7hp5_NJq3@Yb zPK_^~1s%JR~5S`K)W*6EEp z5|D|$`L(7W;_&d+y>lG9^FZ5vsWIKV;i%O2*6sG~YEXC5rN?tyEVym^{yqKaUmf|E zZuW5Qoa&(1r{`Vy!U4E1XvlR8Dx!hKLfKb-O6cglm6!KD7CY+fX72pqV}`SW1^j~& z5n!LM)N@t?W#oNUUOmmh3e8Wf=WqJKz=eyq{Rs4GWsWMr53D(&}|F#%!@gRUkX;z;H_1%D(xBXHo?vMnWk zZtgI?nV-dC15fu7xh}Z%bJ=Fz0M`3cf2te^2UX=1mA?czQCpN!?cGXcG!$_<KFCCEGioYyx*1u9KI-qmD&#Q3LVlz z+YjtJt2S-|xUBBx-pnBO%P;ABUg-FO5fP`GH)I(ghboOr$toAVBhTB_7ik16x#rQ~ zgF2wIbe%=0+zzFwU;6FwlN~PDoG-sttcJsA6$`E;y8~0r%>1xJZisEG)AQ&_Ys9IR zT=`?mPW;2oVsJ=|7xr-$zSZ53z*6j!^$l1FY|D@rmE*QU!SM%DI3XiY`n1l&cFP2O z*zN7?J2#)$mYh9{l6B|_byX9U{2f?d-S;A_=l>@!$9*LzD zn*#L}(`_F&LeMr&Y3&oG>F{aA@ydl#Vt=v{EEVpz1fi5?q(8R!LydHNxK%I?@kw8` zu+YedG<090MN>Us+ElgyO|$~|pz9A3wi|%)CWTWezr4|~=n1a=_D+`?3P)>U6Q7f^?U6LBFT*DjpbO)~mY42<@R#O2PqRBoc*n1r;e8tN z$c*>Qf|-ds2v>R-Fk@_n__Bj~WCeUsyuL%v<5P~n_RqOsZEZ&|<*w=}B$=g#(o zbNLgWA4}6MlX0k|v~ltD>lidXQ~gEeNi;gDj{HJDrGn_X-MeS1_Cb}RM}_(&8DPlN zJ7-}d3zR=UOUV#}Wyl8;${f!W;ka4+IrZnyGiZZtI>Q*=`7C33Wx-*!KHp{)1=PvLvL>=dOTy#L;B9twU#H- zf#1sIO3~4CFqytZdsk^CYNfwt>(*NU?i{5Y@BC&2f7nIvx3kotJ7|L5)1(j#E>JS( zJkp2U0lBB`k6lNlYOYRMKAE5_C{Wt#!x>m1JbLh8V-hkL59^sU^}@oeKCzF}X^DP% zbNbf=e;}57&Y-Kx11(E5K2wd-1lx6W!qd0{u~=n_$d9#jg|?b;vzKd(6HyCpr0 
zq#um*){s60lJiaSXGtz1$?GCHG9=%C?A?-mFtVpW=4Z)V7nwI8-@(ZHEO|E|_gQkz zww_SZW&G6w*#~HsUZ6K2`T@O2-Zr{&zIFEmj$DSz8C_ zIog9=S)AHcZ&kpBsd~k4SPpRU3a0r5oj|1z)YN~E_!4>3(;%MU=6i-bIo)C*13jMr z?`t1zL7<4^!3b(UP_{oj?^wGYe0S$lt0h%1NE*J(J=7Qj`m3hwdiR;ZBkvPE_C+Rv zNw4h(pL{rt6QZ5}C~e5W{8y*;oXfWZB~b_BpIkRb={gnd*>WaG(4unI$5Ri}%Qo-h zlCXvDlzK-wkH&*o0q4|k-yo#=(0nVGMKGcrJ6#>IQyNwmx^C0RI**TTd71OqSrf){ zjxTkdJpnlCRqc4U`61JBq2ziCOK|OP7thMPIr!B)NIBx4f#`g$PehhqK{0{dlw3x| zDEjWR)(1bb(aN5j=LcpJ(RhpleWRBvq-f%Qy-nQ|ta`d#8~R&>j;_|;K9+U~YRI=+ zeu(7*rP&8~LctAH^4fG~8fX)LqT*k+O{C({jvH%U5ka7N z+`nN|$Oupz@T4>l4MT(9D!krZ(Z@X6`=I$q93IM2cRcnW5iAPOwH(dzM%CNb^SFoN zPl}0ES;4Ftsy9F5b zAN56zLL&=Y+Gfc6)v*#KNi#IR>L@vzX#$KZ9!yrS2Z0G;etCEOG+_TZ=kJ0{7O*=! zsOWz&3y8KEJ*Dy_@(0G-6Lf8|@S1e@oxb`=uvLXZ(o@(S#D3I_#h$G`aTDbUdcrELD`^j{-6>4e_kNpKd9pUsW|)}dq7foaR#_h>QV8! znj3pxaHO&j3df@he1m_FX#q`ns_lQH&Vxerlx=*+%n?O|6HA|r1itTj;Z=M?7Iv%s zZuP;@6FmPT@ocSL4{)}Ace|$LjyfHm|MT5a3bz0AFaoyymOm4dLH;Alkt>n`K&)fj z|3b4f3f6zJ$@#`WobGytPHxAbM#3f+5kKyvJ+1Ujg}8+m6~8Ut3BH}+_w(Y=Lv&ix zMm0}FfcBTj%{~~O@W7ir1^Be{Uw#&^U){L8g1o`#9j~2m#S(P!eP7_uaYFIylZxC( z35Z`)vf8{G#3T>w+uUbg7T=KAoU?aqTYq*QO~xJZ8yVn>F1XV#`zg;<2fV&GQ_bB; z_#qjD`!wGOp?J-YPG^sbA{FW$mK#G(h@GERX+zuzrJ8D=jyUZGgrD+UPPI<}{yh%T z)jq|*@*+jCt->|1&n;G#$)N~Dd>K{9k}Lv6wWqQe+1xSnWn+;Xw^XneZo?GxwiM_) zHQbM-h{AV-xQ~sqg#k}3s;`#s3cw%BvwH5jvAD{E>C@Ph6Pj9aU~DW%2A*OW-^$;H zG53eGy%8@g(Pw7qjI}q0=+c;xn&S6Bg7d>? 
z`}UJDEIiRnQKMpsr+*~%%Q8BG-p0aiT2lu^dc{fKE$Lw-{a~cGhV&_roNtmpOL7rO zUKh!cA^8Sm@0RR?kv#=6KTGDi$h-mh4o2Q*$-4o$&ysVt?_T9s#*kLH2>qvLZV-IB z_N2tCp99H<{z$FXL=g8@Igb%OKm4xftHwj3m*L%Ua4e#b@GmWjxTzU< zfyp?W#Vm|4rzfn}d6>>6|5RWYN7rk)3tX!zNKsoCnQtwUCw+n$Ei4+&9;}XD$uFhDED|T4=XUPNFz>DzQ=fASY1S6q} zmL0=@fEC)wZOGih{&cXt?yw1VbQCAEJ1J0s za?A*ypJ{j0dL4&;>sPt{*_RBPHC`TGd;e4V0(W6wsg*U_o_V^wn>7x89=ObD6*}0# zYPR;yhJGKeSlIS1^ejMq$N!rwe-?`L!m4G~mkOYJ%J8>rHdY*1^l9*D(Nc$J$4lb_ z;>yTyZk~RD@Z(+X)cxmS?9@>B=LfUg!u-!$6UxGxAN*Yx@t%-YygRIn${VDXdx1R~ zz5Ha8iwKKSsNNI2F1$(+cZcA}bf{LU+7NsLSxxqdZN~k?_kwu;J?yodfI8&Et%inj2V!kLfUWucH}hSTJObP{xP{xe6rHx0U7 z+%b9ZrZslj8p}X?h~ORVY#yVwO2_RcJ?i^MTyb8BrufWK46^*fcnZ#y;O*)g4?O-l z68yC!zA~Onlq?ySbF`%tn+7;>uaBDJW~&?-haK@~wWfzERW=6;C(h}nrAC1A{ZT0f z)`6f$G&xRL$N>wU&6r3jxBz86wque92t^tK=y9QJ{Z|k zAoH_iu8YhYkndpReU`i%kozn-XJ;E(<=EKU;PK_YcT|!rkl$zLJyjk(T*AAxi&j<} zmh`LcZ`C~ur^JMAl$@4@zn=`W-61?VlO{I0V}Vidgwo>aBf8P>=9ikD8|TjAof+QM z6T!wvT(ZIGf{z|td&Lvm(CLNyoy%{i`+Flx=eIHm^FDBZ#@#pgdm^?~WET4RBph8V z982CunFxc=EJ<>1vjr773+CGSI_N&9dD8Q4MX2r`bJI}R3bP)*WH@H647u|gAD6Nc zIdEsosC{D|C_QJxGl=;2q;u&a^K*(MEVaJ6p>7=j%a%q}p8C0=+4>e3{oDck+F_&3 zY%7V}4~cO+xo(1AU(e(o{NjegwF`U>8Ms3oI;-T%$y%7cKg|24f*u^WvHZGq(h5G2 z5imrC{*dK)vjE$tFev)AIx5?F{U@~g(&xgI0)Ki;fIY|N+osN@x!c}KmJxl zA+hq$-8}u7SiQ}`t?>cDvyn;EyB{8j*pY7ErO^yrcly20o_Btz<)J*aM5#5%GBkea z{VNiBt~0s@U&+Al&eW)!&yXR$USFtCKoy}JorIlK z=A?69*W`oR5>4!v}+I1cPMf zmebllG;!5+$^OS)*;xJP$gpBD!GU1fHSX#zLFD6YnHTRS;D*U2m+aJdKz%uw^2ruY z_|SNlhESn0SlTZ!yX}n_>Jn&~YyENvraJHH(>fZ0=YE2tlZ~gbpXa{&8;(Yx|Ir8A z&T32GA?CB*WW*0C4JC59^fK}OOGy`0uWI8yyx?(U&>Xy3>z#Kli3W>%C2Cr6+_1h$>f&nAntFlCIs3_MeTk1B^!wC?nCwdCF6QRxgEFKD`+RSyq zu~?8ybJ-sM~lPC91$5bi92DgI3 z%-=Rp$D?Iow_Xq&_WZ%ayd@fzNImWdQjCJ8_R{y~Y!Y$W@q4BoqgO%l_I$_NNuF>Q z$E5j-gZa3s^PG3G)pbDeAR&?QPbl27)A_ogP(C)|d!0FVvJ}*Du3J$34TKak3%k#z zrQt$>5E{z|*+5c9M?dnIE6io@(J=4{0zy(}-(%nu!Er_l##|6rE>aah$vjb{MR`@MrMRyDH%a`nU5h5tyA7Fq4)lu zSDf_Sk{(9V4@P=xNS^}9`6l_ZBo~q7b&(tyl5arvZpl6v*;63%vt+J|%o~vJVB~$4 
zyc>}FEIDTteug}&SZRd^2Cd3&UXj9?b-Xx1{vu90G&J&RT~|L_A8|v$B{1>=$&TRPzuDRi}W1!cg~{teOV*k*+lMe zJJsWwDJ`5sxM~k5*ulD|`6F$n=kX7g+|#y__NXWG!B5rq4nXSb&+0raKg^f>L~qMz z5YY#{nsi1)7%yaL=oJi|1K%nh-Iv!okAmfXy%Sbc0x4tVYK(ujV34ouy}RTRyv0Nxc zk<_v`#0#u?d}0IH?pWt3XZj<}B&eiwBdRkp8kicYj~N=7;X1#{{z-d37>4<8H{y}gqX^@W$}ET#cr#^!64(C)pFlsFLYe& z+@1?_3Me*_UaQ@h@QS_@nG|D9!~c(`^Zw`h`~Sa0No7??DkS?QQ7RoyBzt9L@4X4x zd+(LKv$AK%$+40oMMy|mN=Z^tseGT$zVFB9Cw~C$m-D#aZ@25!Y$9pm=k^HvLiJeP zF-t#OCNB3OB7p+GIk(X?TuXt)mu>IOyxfkxZFY)<9NdHV+NK%RdvW8z)sO?L3o@9o zpXt`^_d0k%@R4Gl@;TV|Ead9s&mbVbC>zGv#DzzzTg@8IUVHzzK;h66ggKF#Hi z3b@QqeBbq*|#X~iR^%u z)`zlHIx%dt{8rfBg$+8qkao8Y^oHi@=?$G!3c&BQbt~mT7hv^LZH(*{|F)m%J=u5L zy)E;fhw=NdR}XtW970mY^*z_6B!Q?Evl0KKHVBcT4Wd1D4!)Qn5B0|Y( zHailp3-&s(s!($u2Yc8)?-Pj^1>VbU+*3MsU|r=Vxetb!J(HwKJq4IHJxk8dLaAkW za;^*32GHkN=y3wsR(Q+hsUWHMP*URh;WX&F4lf3*v)cF6&55%VMwo z*^}#|M%co{V|^sm8r!Afb%SSC`0XA2>pN{+p(<~U$~m(rVB*TWKVj7zZv8B~7`59U z`YJK+*`?_ZG-qmGcxbu7{J4Fw@%goTGrL+fP>AQibBgR& zU4&Ph@ZAy~M#2w9cxwos0>Swv__G8Tk>GU^92tUdK=f{jJ{ZwcAo8%8ZUjZQLyNBmtR$f!|pSg1u*g#zpa;WDTyb0vh0CnvZS?^Ciga7WLumb}_6rQE#a2Z4La+ z#eVd7q7MfSeMYM~q~788yoml}H?a0aoRxA$8JGr4hu^oXSI6B(S4M{B#36V) z)+-e!g4aLQPJg`_f-SVws2kk@$BuSRldzAEJ9Yb}G;YE=DkvMGKjO zy#2OrANJ(H-#pw@M&h>6BH6DZB*C{>t22r!JE`nZZn7qx}G#T0*Xa zl|S5JobS!~&=wTFdT{5xMUSnV}~>_KoC^jd)liT20gXLON-$x#UxUNvM} zVYwtB}(AWO7SE22z6q$~hNdJ%!%SM~q5v=Yq1oL{2;^Dsi;BbzBzD zJ?V6EFSCTN^v@R!9t=g3EyhpWKD*<4N8}a{L^A?3)byqHq82(+(Xze%%s%)&e)CK+ z*LSI)*q+a*k_z8&A;r{ej-kdX;hh!@7HIb;*$1Ckc%jVfYF6LoNk~#vJ)Eaw#l;Kj z#ih-%;N?A9Ek-(dBtJKpvTdI-*5KFq=M^_P_rsd(yIo(JcZe4}j~=+5UtN5D0L0Dd zlkR5%i0ZhvJjtgZg`_GmiSrGGc@KJ!__I<02Qh0pe7>beJX1f{53ATI9XKlZ$))y=Mn3E&t8g)b`%ds^Wa|34or+*Sj zIp}xk=XUy_S2jlv3P=Rv=$6Xw!Oo#jDNDGloZ1*2zAHB5@9u_aJW^LPEJMIFb9HNg z;5BqAbQ0XoQiqlwZ5AKP>%scGUHVhsZSZ<+=UVE4aKQ8QyL2dYMYj`J3e}UXVDGI> z@p@w)nC9|G-QadGPNw1gx#sQ#JfeScuzEvK`t2gVtZEAhuQ=hmB|MCTAB^zU5IzNh z^G)z)2`(bR>moQZ1mA$@-4cB;qNhOQXNg=FkvAajV8niw*c%YF*JA$J!u1Y^VtcKZjYRztl9)ur?*ZOn~ 
z%n;b33OYQ^hF6cF$jtF=P&aq*K>6BLq~vfTT7&mEE}O}|la+r8zy4~;%q-(gh(5aDwiaDE7QwQuV#dhz=Vsmi$!a6ZOq`=Y%0l1*7puP&z8VWtULN5t2wdB$nV0ZJMGb`+BLnI9t)5<9oA2= zbO7uQQMUXzB91R4HcgusKa|=k&~(#MX2gsl;t4LErdZAq@TJ@ zrD+ld{lYH?#8;E}RSc)^)>kBAg~bVfk5*aWn%4g6{#SG0BY6Dk{Apw8fd13Vj5dYd z@2n$-bA7S;9jb(9XGnZx6Sj2i@5X>9ea@^AKsQox?mr{r?thh|F>Wf$#KlXdr zSbABM0k#J8T%y`#2wOEa-n8hRfO>piw?gZ@ko>qt%n@D@99I3C<-Z*};C7AK^NY3t zD7)FH^G-P{mO1RU?#Qo;PX-^oAu$n#8kH3Cn17qXrWY4K77IGyZ?x<@f<0kqxZphH z{mWjEf%|2;N5Sy<3W>D+{&dRd`04G9l4rKMPmi(~S};+Q_C7RQS0@&vRP0~!t-fxblCcG~@=Uj|Wd zNgpNeXWg7v{_PEZTXIy9-?M5%MS0{o8<-0=ti`zDH#)V4bK_?*-_kN=gwt94W>(Xr zxj+`u4ks4tELcK;s2@e^&>!ykxX(`ac{sdZGD}JdhQVOj$-Or9QCM&2d;4I0Hrl(u z&hgya1aepN|2}jx4Ij@><{qLeMl>(fax~2x;if?58TQ?zeOO0h$+|->qLdPeEl6{O zw1AyNTMx%OmDdeMU>stf(=7P%&2;hB%Ak#xHv} z9ej4Ha4Nvo9LgDTMy1h&AkVj-)7#X|QPtKP*-lk=T<>73cG4pZugU%YyyArK_WwML|MPo zBzRo}M~2`V5WQQX4@UG9i2N*(>mu?7#2t*-&k}nB;yp{uS#1)asiUqF%9y5{VyF~= zMWzQs?k@4cH|m{yQcjFe*Ihqb-K&njh)vc^ys2lE80-0VP6L#Urfi*|lDdk9ag}+57)Y(}= zVYJ>U4`2CINS90#BoJbQJUD-sy(mk;qN7{dUv>Rqx}c6UgK`+kUplh(Q!^6hkNy7S zU}6W0wmZeTP{kl!ZBYf91Mc|I(5;Vh^e$+~Mxs&JN&(49F!l#Bl6Xtg8|VjTB>tf> zw^#LbGW>q%^kc24NYu^M;pR?l2ddl`N(xSez~zH`Yc3ia<8CRj!feuh_Tv{v)tW0V z=(N|Sk!+O^aB!YneadP8KoB=mPHZ^#4BYem!~0Mu(MIP?6%&q69nMwjjY)&~2@(OE z$REFSe&^@cZ468IS^YWb{FYdynA2BXzorDa* z603*8!hm_5gj~p%F089->M%Gv43+x%)RuOfL^W|0sxIM8SeEhUWkuOSKsna_+Gj2R zIHjJTcM{0Q6-Txyy=E`L)BP690!`JhJ35|Cqaq&+^(ZFJJ#s_Gt-c(|_|=H#y7Q$w z1#aO#d9&23Jv}hEvTEHb>+T{)_RiwR`^2WIsD$EC-9!**m|GPD_T$>l7H z25)9WC=X;rqtTZgvBy1vQR|94na2Uw-!pt5^~7<+A5LYU`rs-Z(_-BIc9VhNo0+ z9ABd>hq=4ud_QMi#X0da770U=D1dt){J(c0@VV3qhhr>1)UXlRSeh^+&9w`CdnMf9 zbUYO=_t65#tnFu|*kFl%t&cOs?ybTTI~eah&~Abg#VuXyY)v%uZ)Ya` z-cQ>a;Lb2ec*P0dE#YD8`kx<+@YWDM1%mTU@Mj4wBEjn-I5Gs^fau*4eK4Y@K;&nM zTo;iyAnstqewNr95bs%H&W=Qb8%;|yP+KnHzw%x4FjFbs>cZd@Oc=-O3_=g!;)%r4 zjqZM&FXn#T)mshxN6+-~r+g#KY}48Bh`W!Al|=d%xD9`D(@fHpWVGYQ@=b>l+Q zODkuU;y_P;h%Ki|4ZJa%8>M?B1j)ts?}+#%gV@W$d{&|haQ^E&+Q`{jY#*j0@I9sp 
z_C#hf%hjbL;adSeIg4z7(A?=S4Vrm)-u*k*&Wt+z!&lO3S+EgO2}dze)n)+M4ZJVn zQY`vlSQNXlQx`#o9j!Uux!8z0$$yD-@4GaSrst_p3ZK1tD`hEKf@LMssk5m_zJ!$! zX^s1CutHNeNj%X37XMx}R;w<822DSh-fv|mKe?>En$OI% zm3*$karFl#83WST;fuOZ=pSojLM>Fq)5?N$JBzQCe3t^*8&~p;lN4b&&44g-oH_Om z-o+F0Ob)g$Z>KaZm4;?1J1_0p-3^&(d3QO=cf%t)n6x*v??Dmv?~%2kWAHH@AD`y( zEWE@Xct=uZ0p1F*aA1v_gAs7AWB2Y!cuFCENKz&caVYM%8Ut?Q%ELQSplUEiACqyRhik1Euzq{MG6x1jl(Fb!c!zA%&gl zANyo)kY-3ee?U*(QQt>QCWo`v^*6_<{rUyiz8$n z2TnP4o7@w}F=E%q{Y9ujJ2A4!lmqnjDhh-DF2SQyiggW%r{U{Q zb+rf0nIO94 z2jpP2^j7DM1Ixyo{%$mnvBE?<+w(bVpz<{K$QhMj^hx$l%(%Q6{GCg|vj`n9vvBi- zYurUxdGNW!{yGw;SZ0B>`=K@Rm{wx(woN4UNch1BZw=v7 zAUNLyf0p1P8g>4EUKhcUA@~MF@0RF;5j_PWKTG7gh`a%D2P5{g#NL2-&k}RCUnw)k z`}J#RC&PGN?DGOFjvKK^xIYO~Gqluv?+xQjty@Exk$!-6(>v-@aT`?9XMf$0G>HSA zDDs?`F9tT2c9kE@df~~SzMY0fV;HxbkkNH11i9hEuAlW0T>KCT_6#SZcQFrWE%F@^ z^Tcj@8;&Zh735NE@TU!*&EU;(KHmkoLmE{d94smb?3f`(drn*f57tJ_x$^wWeAl5Is%$>KOQq#H-ZYkQu*s6KAopCd=vN!Zbyx z>^!d~IAoCD=*roIhaR!Wk>@Psd#bdQY&Uo-q-@8V*d$m#55B`>V$Evctq zL09_CdNdUO2!8kZex^Ohn3QZ%3^YKWUY4X^VARHqeGl{%l%?U`-E6WMHd3(f&vOxK z66blM!}jPJwW)6COfb+E8mjlFej86_L8kraUCy-P8$+ z-45qYkvN%dHzK0cp~koJuji!4r6h}gkgx@mRB{r z3J0q14`PpFEts~T!lYv12`mg9XM9sw0c!VXMV&I~z^Ml^N1CoYfp&K^UFt#`fl1P- zQHPl(yq)LWy{^6YVTYH$+Q5YxaP0dJ(eIB6@l8;$IJUbGy!?6Szmv1RK*`%|f&N(m zo{w(2sTrD#>1Fi=8^Y2dweva7Uv_z*nfe2_OK1+-(?q**gr*WliAii`?XSc$d#EQ2 zZZ$%MX_tzb8%01$-a+%1Q3^8pL3L?>u>c453HHp?*5Ts!+t=JAI-ucZ%H=cK_DEf` zpHkJh1m&{HeE6<<6Ro^`LH341G!jhq*dj(`c7Se|I8p*G==uE1#}wX6`**i}bL!~7d%{|W+P^(lvug9d@2_e~nklzk zsn8~T_oK`csW6E$YsXuqeE46;ud9se74Z3Jua>i63yGWIpD`PI2Woz(T8=Rog2R&L zvNb!hptG=);kBG}usc@t()Yz;%*N%r98^~dk4f*jxOprODA*rcTgqz1<3`I(mn!Sw z{px}eTZiC-P~_ym^<=fU819^8 z4e|Svf?H>0KB;(X!2m^?QdiY*F!8PV9Oa=56yrVg-ZX3wvMu@3Kd5cODu*dXa<2M9 zr3G#k8NI9EH1l2I*Cc)e;T0!*w}gk0@PiTF8p5YQaJ~utEWt%2cwGcXhTt0zy<4IW zM)VYj{49~{BJu{r9gNt|5_<#UJxk2lWyj6&o5>5XLbA8z;`V7+H70sZ;rCt46V{*o z_H8^!b5;}|w`_tdwI@tSC1K3PIzrDwbrXF2X72XJuLl-c-96zQIfd0fX4M(?)PeH_ zha2}Qq8wsUQGNB;e?C(rwE;okF9@*nO&w|y+%t6paVsbar9b?*$3#9`W& 
zt~xhFv5WV^M9fVru{K)ieZ3G)r0*^GV$lS-eCU?nb~WJ+>5tdR@7bbHZ2#sg>t`N5 z`ct)#?!gS-_-hO!zwNc#cGeMET*@r27I6o{ztu;!g_+@)s>z+cH7T%IM6%V5+XrNG z771#ejKoZO2Y=q0Zh+_3_b86}$AT+m4%b4@*W<^Tk*Nv= z@vW!1YB>!qIPUTB$!9@U@I&V>p8q-`z%DLM%HCx!boT3hPAAgccf~)c_KI@>R-o-U zHX54&8C49Ob51CN3pO0TFMHY`<}(akoTMI9!W)jTxl$+S&3{&LiS)h^IkWGCv!6c> z-Sz#p#XV=(7eTwI7u5^xf2ySqSM~~zTm?Je1>raS&;6T#-T>V(KJPj> z>3k^TU_ulaAG7uSqZ|ZACfev2=TdRm>w=%2hF*BVW`X_bBTvYoTp#Ah7!T;HZ4@T; zG7#Ig?CZAiDcDX;>nj^d$2S{Hl?KK0q0+OaFDNhzjPA@z>D-luNC}#y0FF|q_s8LD z6UoOgHg(>T_Gl&4I==7k0>2G13Y7hln3;+s47C{>6mK98X3G~OKQr7J_%MUy4aMx? zhGd@|2A4U>o+YT-xH`r<76r1;IQRI@yP%~@s?Yxp`yy)AGBU>#Qb7SSUk>LUJ4ogp zVeYxTWF7}>-QOBo@hK53G*Jz_X0%7y=bv6&j0!_$hI=w^AG!|Ae;gJj=Ma0ohB%P( zF<~qgC&+i(2{y5R_XLxbL;v<%ddcNmC6%>sIVP{E{CO=r)A?73C$|k|{H0q8EX1(B ziYZm9eh9K%ac^rX7=`!4m)~WUkHJZQeo=LS0yyenv71{W4AEqj*9h+`z#)zYppj51 z6wv2lHq{YFR?L94@%a+{eCi0$cY;KipDaXI z<*kMP6YhigL(J-(2a9pgA2Z(@2fg69^N&zA#b6*hy>0Ley92s(1FrueGK5|1Pc>;EdQhc$ZxSGIU7(e{Xt@4XXoU`6Hp$w1QH#xJsv``IUidyM-M zKfmmQ@~A&)>~aQpR+4b;rneg!jVo(@D{hM@o^Rh89W2J4xuSIgZ!)31tc~n$zDh{> zH@ZD2wF-}N=bMoC27!maoF>0#ndhtj&Do*<%%-zf@4~k>qsMoIWx|8jjzd2S#YsIz z(T8ttMB-mlEMubTjZp4qw{2>>8EStQOYwBR2(x#k4?6wqfm8>nD}n}$K|crI(*x() zagf^5)Tb+|fMdI;B%Q%Rm)Y}`Ht}7aIN-j-tl)AS9&(X%dkYibX4IH#ZKMf0OTAZI zh)n~f+>tMNxDnp2@+gA~?m_zn`91whSpTqS5eDJP}am}}W%=x$@W#C>0ka@t_ zQE{{hCVL7;^m+~BE&Ab_{vEVhpq~~sorI!mSfdlSxuKYe@eYuYdT^M)eYYyW(9NAAw>ntilFB^;HMsdq_SbKVr= z!q5;9;QRl1$sXek|2Uo|lWhO^a_hkz|F}o2&hbu3g0YDD7T49FGqGTQP*+WbRW!I0 zIX`(RHws1V{Y=h#!*#j_at;x%?Y0{y)o}9N7Cu=NAm0=4)XrbqdwyTQhc=%N zg7FJ@EjRf&kPw5`@_Oc$toqQ1dHmLrxgGSK-&=g00PlZR7t zPMdX~Gk}On+HWm*6-WI%l|Fo1A2&bV$Ml&pA28ntr23wxhP6_R!Dy(t*%g0wr z%~g=f{#B#49(OeVxO9ZECjjv579VD25c@x`IN`e`JdA`NjPTYFJ_UmFP4H(4E+WC} zA~-Sx-+<`d5`8eDr$FRqiChi1{ipXv6|m|LTx>6{gia&hZMLdgaaQz)x4E7EC~=nKmDEl>5MYx3!=}p|Xk=0> zg@4M1w8Ron8mrZt%k$il<_;o2`H$J;%A%Y z_hj*MGzC42I3NCcW=wd>DheDp@UB45%L|EWxjdv9G=;@@#9zL~3P0|g-@VUM5H6@N zQP%GB0dwzWyj}7gP>pNvhE3EN^y6ZpUT(x!DeHujlc$eq!HNE}twkgsK|YOP;$gY} 
z@DmGt1%}=3z~w}K(+OsG@JzuhLFj`Sdhf?>=Vcy++BWImin*tt-3JHWvG0pOq3!3^ z?FGXCt*GzSQP}`g8hKXdL0$`FexR*peIpG~3Lm1#jno1iFnZ^M<=e2`BS+`3Rwfd) z`ehi+kqY8Q!dUj)Zim%#w}sqelF-JU^H+apC4)+JN>t8X3rAPJFvSULfpPlBLw30Z z(28Q@%_Ex#JzvO;pnPTA{N)2{cp z!WkPdgD&)gZX?9+RPg$JxC>Bt%E$sz8sX!{$byK3ENo+tR+sQQ2)yPP)X;4d1+Xdk z%lmx*g~;zzuu{*16chYW!CpxiP_<09K6k@$6EAYlXPbj1H6^1R;#xrMRDbFF#j9vE z>wwegMIE5BivIcRM20{A^DJF@bnu`5;}_V@9OC8y5b|YVF9}BRG;?M&@x~jp#6PPaNkady)H#FzedUxl&pU6p(PrgRjXFO}Hz!AKK{& zS|d*RrdFDPX~`@Ki%Z7ni2rAnzi|dA=kr1A^-~Y^bi7u3?tTgIWwHqSesln7$Al$M z)C7RZ95`@5ED#(ja7`~!jR3_inIA$+tx%!w=^Qv@iL=80o_Y8?02#GxRamfJMknWE zeB4JaVvUi-3>lL!BoSHI!sc`h2n6SwB$K%JQg76Z^0d8C?)Sy{ter`~X)?0u?%Ooz zbk{H)x#N5aeyAMY7`hBfp=s`y`a4kzK%Ly0LOQn8_e^Kkj`xM%gFc zymY;e4!#v-LnO{giknFDaqTO>bMJ9RnWcB#`eKL7q$HV1+-8HgOX7-f)#c@L=iN5o zcKbQyC-p+L z3V4hUT`GkvO%$DCLz#H_Lc_x69iG_mu!w5aQ!5a3b@n5rtv_l!eAmwbhaqk$o;-mD z6_miv{dL9I0hqsfrW-!x3K$N&Eid!+1hxtc5iP8dKzO;e_=58}AbxE6@9m!wxWH0w zkDY}figWxfR$?Oyc5uZz$=504mE%nfe(mmPH2u@zqC7n$dfr_iqE8jdy}a4KscMUC zQ{V2kJ1T|CH$KGgp`d`j?mP;RBU!aKuWffqd371B*a;l#46+4pW6tE-u4v%7U;)$r zrj>BFOX0eflL`9OgJWh%p2Pe>&P{pSbD%crip^JBIjlGOy=t2fKlEb{%qwL)2ZO02 z8!A{m!S8#PPeYs_y5ekXJ3#M@dc7ybjqEJY1=h><_I&n`a=@tYFYKPoR<^>u#>;-{j=4x_%S3hb z z#{x}moHX3udIfzNu9qF#b^{N5X%lHbk_R`kz64octAO{gy-m=Ce6-DG~cO4$H3YXDa=<)IkAB8 z%cn<0IWczGN>w=`0P`<4^#}ASL(eZfHd`BZkoITiBt369tlNK-F5E2(K9Vd756e9Q zvpE?S#vFqYE5p#0tbGxfiT|1+E3qDB%k-WS2=+zG&r5m8q91J1 za@*d5j)AC&pRcP}K9sC(V< zWN@1730849{ofJo=PxfIvpj|H!$hmyI_wN2l>cL{a&T83{ z^l05q?iq{{1wX#GrbMMI?A#1V@J88xXx)q7O#&6o~vRk?SJz2E-kV*v}Gs1L8eP%-O6D zrE&Z>??C@AZ}iUim1EYDsrOp6QE2})qXK?03_AkuJ{u^?!OTuRXh8MK4o}=^jLS+=)qmUkbFdU#*gO3uaUn zu9-T>%1h8tTr&+exK8rgT&%;H+U1XPC8MEW`sXfTPIuJsGyZ4hq&nJhp(bSLg%vt- z_}AX}oD{q~E1hvII~{8~l<$z!&x7+`Wv;4rKG<&OuofwAfYal&=}C7mY`Zgcl(e6P z^Ghk@y#f6DtwD+Wo(&j}R-X^Nh=LqH5T8N`$c7n zHS{PG#j$twjWaJS(f1J3uCF{)P9%hr{_ z&7VzoT#OvW-R$YLDovx9`7d-|k9~s2C9*$p)l6fwQzl%HZvpF%J|8p|T)_Q0|51)N zPGc+Ew@IhFM)0OU1-sj=8o0 
z3gzC%hc3tTPPLT)ir2~0V|k=KQfCHk*XzWLcMA*WXR<-hy;^yOlg%*9{nPT4T|8<` z)gC%5D2;j?Jf#Fo^B`5poy+eJl)?V5CkwX>^Rck`yyLQJH0tvSTN}jbpu(8vyt)#} z%W;;e{8WB3)O>&O=*^EcxWgfRXmKVG-7d@Zja4WGpI#-~cP(VX`~Z3>rn_}8p#0mX zPYb2^G3%og8xdo`RH~@ho=4)fC^Ft5vHv}Pu<|u-aVDvd(xp? z6}8yWh-e(da)8O6)*4ONPYv+4r+^3+{kPjIlYrFpjenjcgYD_MWd8@_<+XoY`W-i0 z|M8Lygf>3X&gLVD@JLE7u_RFS=fVrio2el8&a?EkZ&_$-*UchQPaKK}OOyMHB)^0n zxmSaw(o4yCZ^*qut3%2mVjd2fR0%Cv+$H(`z*iwze5`m&JujaYT3uwi{5D+$-tv+& z-<$zBZkOoAjp8)CV*d?itG8mCk3d<^K?4iC3D=YGE&;!grBmurZhS_4*6AaIHy%xO z=#DR|1cy5Ji8mXh!+k!I>E}GJV>LD~tEf{6zGZVgZ71Ekg{4xpdd5mHzhhRlu`KC6 z9`mi}`1N79^7c!+|1&GZ5LeFksX`M7i1R&M*)YZ1(q0K>)@#8hUMIxQLms^TtyL(n zhQ!07XZk?>+7BgfiOrT4C18sSw+cgi{9xW*ryJteF5s+vTP}Q`#KEi6SEcu8B%=X& z3fjEdZX8r$Hehh545}qXcmEPL##M=ZSI%@040J;Gv7_^7{05nge^ciZtl4QtBG*Oa4Tw7!v7aUO2E==on6t66SsQ{ePodvHmEbOMM>O{? zY3Ev9D;|}gIOCo}$`J*bXvm}ngAe$_m;9M#e9%-RY+`#1_+Y-HF+6!Aa}Q0Yti&xA;nby#wbI)AQ-V zhDmwAEPB2e10h!l-Yl;rrKXu#4Q~81i~`0*A_b+Y2%?pz!Cy zy$pelK(?jZUv^vzd1-BOhLoKl`PaIxDLqoghP=rv#gaQ<<8av8)y6bX@OPtc)+-e8 zbcg88Tr$Gnf(iomE2_ZRhSPNpD;M$bMJ3Bk;|NeFOqD^^?uATD$096(Wl?y#DFbYh zf+5mDZ%zCG9HH7y&DAcA8SbRm#hCKo1mUL*6H4Jg^Uu^!T0kU7dEYs-#Mzlr;8OtcKLU*P`0n!ba0@nLnQtx3?PzzE-{M)%#HN(TA4a zhbkD*Nc?B|PVzQnefgi0#aap8ReGaCPRdNLrtCT(4}Mln`JcevZDo(ap7fu_L z8uKOetF{&5Ip6d>Sw|NLU-&wdXnicgPN2SdQ0d%qrSf0_(TzA<|T z|F%bzm!8Y#vN>bE5wCl4&y$dzE7Py(VI6Q+Offw*+Y&O{?UuYAlZxu!^783zPY3U+ zEjoWO=Rit>sBnt6VMx40`k__e4RGrw-{FAhO1Pf+#Q&qwb>zfbzUfMF4Bm6?9~%9w zkKNzpfo867ub!H2*eC z(IlrJV*#&G!M~P>=V(Jr)JcCx+CkKeCsyMXwwYMdopopi-7`?i&q2Lsavk)%4M3US zSnAd%N~nE$Po(|`iBs_QRcsQS73{aHW)4aZ0#V0xObZ56w}$X35S(vNHw3fmjuyfF{{@>xdnKUxf9RE-4J89WXp^(vC-c$RZe^l7ueJt9|uL5<+AI5pDG$b@mjE;mrcKWV%Y z;0taIh95Re%mf3S>~h=GNZe=H>yG5TflJ8+ul@lWRDZdSy^zBk=~;5^mn3;v7CjQy z2bf9wr#D`yD-EH@*T$K`Y0(XO%E@_hPFf=-`mgFc1XQul7utP8ZVY&vP72Qf3oA5u zHj3%mh$b=;I#W;_>jxI6_dTL;*Tr@bC)b~d+L1i4tRo*^dZPCq=j~%IMWZWQK{xvo zwa~W@H~1a2wDIKWqx;)C4nVu5)$z+~8n7pfZx5Yc4A9uYx9ha68Dj66yJT{a)Z@{K 
zcWo8+M^zmkBexCTf(~}N{p#;-!3@5rZTe|lFzX=o)xg*u$gX+Kpzg$7_^v^Fc=G#Q z=*tnhc`>6Gny|7^zv=ITH*Y+>zh>WpRs;0cUo5pirYC(pm1XxxxvDvv!y>Kd)!i7+ za}Vyo9)l%WjidcAcu#MMY+OA$&`|s8wJU;O$2&{M&Dx-ZM=*UFYY7^Af3EROWg~1H z8GK~S>Vbr9F5B%>v;`^`gBl_Nit+bPX0*Nsv*4ms3;!Fw0+>H|KI3w|59(O1r|pq+ zN8e0WkBu}`<3;I5=Nu@jVD_l-z56d};op0atGiD4puTNOyB_NWf)o?s;;*DU@0sUu z+{`ztq0BjkxIwLI_*K@>%}mS(eG;aLnh{DsC;iL}u1!UvV{JdY%Z{7l1f6@#1wnaO zo@SVB-Xs+E2>f*O^htuEZ`V#cnVKNFZ?1@E(gBE3X{3df2BC<>FHgzd(1|qre?Gf| zGcW&nmKv)V7Rmk(J?f*UL*HIQtk;)=H)29jq^3@5TOW4&HV~~3C%$gp zu$1zLVD?9@R9ZTGvvkck#JUV-ONaI{l5#l%x^nUw?|tEbXuQoYCwJgJ`yxPFwG;;> zO)&fyo&x7CvzC8RZ~({NHgNbIYQo~QnvDx(rO^7&mouJc9MOej+L%bTJJ{z*?%yRoc6#8>hg0b^K7v>MWe7A_#8I=Bg;U&3gi;SELspr4@LZMc$`ya21;K&es1EP0J^udUp0+F93a$Q8;fVhJZ z`&nXdK)h#(IV-HVPrB&(AY8Z-%T_eegokG}o!L4|5Enyx$^4N47U# zao(^35fcJ?x=PaFWA7Y>$1$xqDvH}Bjw=gxNJkFT*OmerGm0^-3n!6wIzzsjVF~=m zlK1{iQ4iLAsddKkRtc0b84_R&$v}-ue3tViu0UzaU#))QCes2Q5;}NcmY@7|FllNu>N@*_1d;a(AHMTqO@w0*aJXJl+>X2yDc*+~> ztqBs=Hfw=;uL5c}&xS$&cM3d{XFO49JjYN=MhWx^41Us{8;Aa?Zrh9Rdx4aD4xQN~ z9tGb^k0A9NcbL|AwYs1`1un+4eR#1i5iFY+?z=o$0apJQ^?wrzg%@OHj>X+3<)%NI zcTFs&!i%v>S5L<1Bc*h$+D+~n@Pi@!N1N7loW0L8H%A~A-j#7xJ!_s0kJjo&eMJu7 z*R}q8JRld`b`N&Buu=>5?38R!iB7;>w4uitvjxoQIA+ zV(!02?~j%@WZAZSCaEV5g}zX( zkot=_z$aQ)I4&M1PM?c6?TyE_r$5hdt6aw_Wn#9fe8qT(g?7m-z80shf2K9C?ZA1Y zPUnBtgSeLVdUxuPYj|;dN@M`PBJL01};j+m3JO* zhY|dtYNw~J;IC9JbuoJ@!I5&8B;~3RNP&i#;s@=a+LhY}f_gK6-o}+a@Ae7!xAf}j zk=8K8JW20(OvD+;9DUClXC99I8BD9MFi1g>s**3JJ-U#=@skew0svp-zM6~{+M&Lp zsA|p4a=b3kTQeFfPRbctJ=n!&1Jh!IFTUP72UvpmxeDq-kmZU+Qf33`d-k5eZ_Rag z%ser6?5>CxJekY=WW4_>s$Z9xndk~fRZB13ACo*&3@*{khWA3D=jC{|kOgrpQT|tY zaX|sIEf|h%(w>D+@xB*GT^OxYK8d;gSQQtBG*Oa4Tw7!v7aUO2E==on6ttAZWab}J%=3-_Qr2d?Gpa`|& z^Cx||)DPN&UZtYqGKvhaN6qSNoOilYxEF>w^SeR_;Xrf9MTh~o3p=dN&gZCHTd}F4#2wqkEiqS=lXm9 zzY<9*AxRR#8&OCyjt-KY?7jES-h1!8vy+t~D{?pzB}p12l@LNvQ4yuzOTX{)^AB{p zxy5;1uj_g|?)RP>-}4oMplnUT0sik6SUzY%WT`g|JeA;|d)|}+9$3AV_1_ znFUp#KE+lcb)5#JO7XVp<^m&Q$7jRN(F^^NQ9^I<6O)_AsSHdz6LzhQ=Mi+k0Uf(wBos{OMSC 
zhhrXK{(Xq0@vqykY_H9cO{YA(w6eF>yiy(L%RE5pUOY%}_VIKU2WfB9v06$!Oq4p53o`Jq(onx3LI zB4~w+4V>p9Q7UM$D528#&jW#JY2X^^_m_1r_;hWbJE9Bq*!C>J%0{Ja{|8n*RI|;c z$F#@kw|U8s0K^N`#CT|y2rSjYWiV6U5fam|6Z*bF^84E4oZl# z`QbS`pOCMoW0CDz=jXEeM6?#9gCg8*k!zl@>bZlS@Q&}H=#;@6tYZGx?IT$^qA9R* zdTN)C)c;h1b7`JPQ$T>7&N2!r?|T>HzS)9fKJ{ua2DczSO4gfn=gLr`K@L0T9V^7} zGlrYH@GSKDnj_ENtB0Sf-q+Yz^@62BWGHep0W8M5rU|Qxp~FVX-gtrUEEi=FnYd7x>hFX8auLp)~p zDulDQ0csA=Ke=n{4OS2D=sXeK3|e#4%YHIkM_7`a#-yzkwmn<54LyDXOFZ#eT6!0c z=PrNYNfk)~Mo)%iygo;v2Z2=T+QVg}yh&8Um4N^d)3$S<=tv}>xV^dIaVZ**-L5-V zzcUTcy|0=+daVM8dMjhGedm!#X(I1--eAQ${a5K}H}qw)>uM>V1xl8c{%|*!5yddn8>+A$p${&vDM=jvbnoA8R$EN=6lOfuv}ZS5d2sG!VnhS_YMutUNe4mo zhN-PJ|4#VX{8KBiE=2FHo@G>HcnY^Nk^)yhH^N??$q-{zH)I(i>LBLc2P6C1?NC%0 zBAdN+sog5hS}(Jc5|Vvo8*xDjX;{FMFSeKr{3f0XA!Rs|`4A9;0_oz(x{ z{puwHb1r0}*JlxB4@aq5-z4uQN1$|YDcH3=95phLwM6^5!_h5~PJZ{lh`Qf(gCErXwmndQ*Lb>?WX;3qPhMyfH1Tgc-*_PEm$5!PTKJ zsR|DFV9&&>;8D?saMVuZ_z})W5Q&9lO^MaOy{e(|*3$XF-`)bg8Ms3tODq8uwnk{F zW_;b4wiFCk8)Q+Z^Q;oqXiIQb3f^DeLfez@wjO6HtJ z(qjad&_CprW!Wff7Z4dn}y3A$n(l>DL|9+YT~E4J%MXhr=*Jbv%EV|3IXG3flc z*rr*JKOHXc4^c0KgjbyK-4Y%~!VgAxYY3kL!TBcmvji8B;B^rk8G>&>^lphh7|~N8 z^0P#)i^v-gcQ9f_>*TStRD*@ODG6RP2^A;sjRwqAVED7w36 zsS-}ipGtUkCku^heAu&LV*~7^2FW3bbFo5urRr-y8SXNY0jBHM@hm4`R&~4u1Jph} zF}xm+ds=EQllTTWgKKiTcME^tiQ4XiLB15N?VbWuS~c0Tou7SlH3_}=;sz!oWd7w1 zvMYUCx9?zW!5_shzl;Rn7t^cly@9CY;eYe2p8MR?&fF9b^vC+8#Qs)1FhYBZoW}zI z{_#rY99>j%TYes0af8%OAHKGL2>jWL*Z#p}MQFQ89&pTpl-vEPrjVfNjh5I)9il=2$M~_r&D2z&enN0U6&jkS z#FPMOF}2{`ugMr%yZYPywSy9ow1ra)ek4vC*RdIKUu6A#UUrA4BQTUXBG&jS8Sv-{ zo${^A1rt&%0>xvAAf>&X$@s7jVwC%-_>SEH%q_ORKTg$&6++);*ZY-%%b9C{@$HIh)89-q9o`+bVb_6aL^I@G-7f}CUX}aXeA5vN%dpNF z$(tmf%g?X)q#sB*sll<#TZJ=@-kMyn&4k*ndt`X3Y*6lJ630oH=Ztgl<2D{$Md#)zBp$j)I^aTingJqhskM54qD8TNdk9>PA=C%w_z5TlY z3hFG^sE-Qfy1Ov!4? 
zq`VQwr>?BgU@IgbGfv}O5(?CqD9px(-O!&9jpE^uYxtS)9h%*@N}ybnOhw(C9o+lp zO6Un6PZX2Fs@7~9jnpmDU9%230qKQhrvLIKQLe$t|M!NDUNhVF*@<{<^lp2WZg&5F z|HpLYZHB2A#vo>==I!Yx`rv|$#``)qFVrz2Ur&)h;*8QuyU8B~IZIKtG%`E%yerUaq`(^q2E;6u)Vm@Hy>As9OKxb>tuvwer8<~o zzNQ~NX9YV&m5#idy#ikN-FWpgQUys&#&%yh$q%`MT-GmF+JV~&^To`|p=hvHjB55k z6lR$_W%ZwfBT{sMp~6(D=)ZW5z1NRs;r@^}rpcogfceXP8z%bME|waJrSt+RSnxo7 zy~e~I{JNtYR6*kc%p2OyEAEv88V>h@X5R?nnR_~>T{bSz(RHt+En76m%JOWTtqKIx z9D8zeYJ@<<(Wn87pH?_8YxiYl-CW3j9G~E1DkRPAyB7v~qQPRjt@_1##%O$YcI9Ki z?D_X!wLOmH(ZQ`-vh6C1FrueG!NAc`Os@q1#=I;bm~yW1GP9*d)lOPv)d2744TQJ(C&aQyuzRI zoyo`i!5e(_zE;TL69aQ~ehC`A7eBHP+9#(!b zK0tOo0I~XV$?J!xgK>FLkG&pofcxe6&iDD!DDJ*3@7XuDXoo|=J84p`%Uzbsh11vu z$xE;F?gX$In=RBYU+t>@2neiqJ&Yn0h2YXa*g`iNK-dh~nn*Y+I@vNG(Ngt@lB zc8)udz1tCZ-}t&c&*IIyJVllgR51rL7t=m_8MuK)*Bo$0%I+Y8AT9$6WQ*9CdG%_@=I4q!!nf0q)o6_}%3 z3wr#-5X437eJ@*}2be6Uel|*)qeANFo?rvgylB)_qnj3o+@e?2hBf{l<`F8!HK#@r)uDy_dnX7o%KV_i~H+n zsPzH)uanxxvaUf*=UmFRUJD@p+GD0f&>O`0l%F-`%!ZO_cB51jVIV^A(Jh6{EJ!8x zQ072_Gd#$}PQHs=2?SWG1e%jMBMo`w2UNXQ;NDaA|5o;uLq>&?fSarl@b^y>{?GwB z6!8taxP3l@elzb$o#u1_d(J;!7)-ee!7qVKAr#|s6YK4IM*Vm}| zBrgKD$ou!W;xeHthx0yOXBT)=c1ZVxdjzIrsSnGFHV1Yp>+?CEv=Q@E@%@=ubJUPO zeQGk+9N<;EypE&-Se&M>#9|i%)5`Bx|Ef>Mb=4tj#)9TxBKl%Ws5~2zo2a+)Et3b5 zx?)*$&&81|&5wV6X6fiJ|Gc5#p~OF*opcoMKhIK@v_iv?n;Kxl?D(n4GG)NTzwqIw zpC%BBD&9wbLmlur(+L}tr~%5aTUW2GsiAMho^LKGTtfUpe%(fu5}>Wa1anK8gZ(!? 
zMbK-J`bgw*-FKWk!B#Cj+RNk(GRlOb6>nRCIv26o;c+2&)NXRv$;};Uh|)OFZhC-X zne_EMW^*X~f=u)Xj}L0?(VX*?3j-{Ln;4w7B*%|b%DYk(k>u6baMJ=);Ce>unx3&2`o$bt z&mv%i^7N>W7^F)e8Wn{ZO%G{Q+Ihy)hg}}jon$@h$m#?bOpoecj21_W0+Gq2o&shl zHr=fS_z_F)z1L)x=Meur14>ff0P?dcM219YBeVFEHIeh@Kuv+`_TB)tUE^gDJEDZ1 zez_^M{mu@JDz*g1G2px0LO};&C6Qv=ZpDT3JJFTH9nD;4)WKZYroBpw02IxkY);Hk z!p3J(p0bo_po1eCbH?|rk$bc(wd3Y6L2{F_e^YQzUZ1GT8it) z7IYW=t8w_==m5*++Zco$xl{kM0h?kp_SSsLpJLdoqcpwSD+0NC_-k0nSRk*ug}ECW8b;Sm$YEwxy^aR0=$+r(RNS8ywlezUr-$g*W!xT@CXnCV%Iz8RYo%Ud7&JNls zr5K>GTg@`YJOjUUt%_k?2*CxC3w0DXGvU?uR;qiK0%43D+e0HJA5;`}XP~LZ8XP|f zt)+Ff!6NtGVUU`Ib?>@KqLYF6!L!HK1EgMv{kX6Kbb?CpQl4ueyfmKF8V%ipb z7wU82ZZ`ndm7k(VIDF97l-@tj(vaHMfBp|rd+HyTo^pTLKVGu_P3C_bPxF++W5=dQ zJ>JF&m+oe1tRu(_L$67~kb%1amn6(#CSRcMXMS%e{lU}NCpiQ*PQ~nZ*GgrKO^x`XWxTT#!hfX*VffyBo;L$y?FDUHU(==at>cq_lBfS z$+4lG2`F*miaV8I7+#h&9Gq~nf-_>PcXrmqq71(HN0OZBaCV^Ady#<+%u%KJ6{N_6 z@xjK7fy4R`b=p1&2$aTNC8viqFa>T{{5Y`CCyLf`Bq=3hOu(0M8!6Q31{)u?7%PRF z;S0GnO!tzH!n29f{(TR1kz+{Y@OvF^z(C1oLT6-%GdS4J(|+~Ew@xVwzNR*VXFAq{ zq)7AAu7l(cXvob`@zdB%kwa>rcA-z%bzUB4B}d|&MHk@1^sUA&J0bjX;mGZ!bw_V+v>eA&a;sq_lz_A zVoJ)<)Oaiiao>j8{-%$XORwPMj8JXiyJh(IO|`2DvOy^1>S)tquPZq9$D^D3u{GGW z=@*gYmj;CyIYcRFE3spY4!M4NHj!ZQT z`o}l$eG>A8)Vsx|{YoAg3_57#@ZSMaPXX(nNUkB}XK|G2aEIfh1NvApMn%dSV2SC_ zaMB$NGX9t~RZQ_k-Matw2Dm#9{Ovk!h>G^ib5PJL0d1b-(LG6uu=2LI&;4&oSdDM( zQqd^DL$W6e>pMKqR7+6Irnn`DVrg;S%rJ-Fx3qKQ5en<_yP zCopJyQP8y#vd|He*d;;L*i=1b>X0P|4jmEF-c zsLkrBeJ?jDck<9ax#qV&HvE|v@W-MY=R9M+P9kW4QKebYdnYu|p0ViswhuLsy3j%_ zUOWR|P!p(Q_o~JDd3-r~q#knD!2!I7#Qk|Do&7L_y(@?B3xhS4oBXhuWPa3W zZ#t&wJ!CLAED1O`J+hBc1p?VuTTu@>7oajPTfB(K|63g1Dzq6WZjf;hSqz)n<~; z*fg+~X&BOt{!KIb9F~CDe zU*I(uClRpvHLSrlo{A)}!sZpq_UDW<+SEvG}ZXG1g|=JQ)%N zK7YR**tf?X2|ar177-eTdCp9*B%N@BJWjV6_a7C+ewvRi+<2!20*;%V-=~*~)+=xE z%Ac&j9Zx1g8r}0DbAeB_w37wis{M9%@z-d_3Z;*+cl-r3IMTxA|HTEQw>MI_#_GY6 z!jrBA1)5mYFTM3$92ZpQpSft2zT7qIsQzf+}*u$}=B#lIVB@5KWMF7_Z@lQUY zsgO$PeD#;5B>b5|p2VRvhnD4M_cyh2VX?~oSQurE%sCbj`w`9mdBq9eE#YA#{9uH) 
zhVUs6oNt0ZOK=e@|Bu&2aAXL+0nxi9`d~y)fymDixh^7aK-|HI{VcIJAm&-(oi&Z6 z{;|)q4aPmMJkk+bfSF{Pw5ile(Pn*rWWGrlcy4k0A^QM{2b&a|2>xq_Y1BHmKFk&3 z25PaT`v;Rj!&9r6mm}6_c;_(s@M1S&Xv%4V;V_rjoT|n1lP<#* zYspZIES6`RKbs=EXuQouJann;AFs=;-u53yhGCFgp2RnRx9(E4*{-Uiw-;+w%d6F%-7t|=qMrqKJT60-p@j}z?F0nGG%m1 zzT?&0I1gA*V)NgAXXDN{J-#+|2})f*#zes`4by1j_Z;QdMHQVVYiN8gqPVHBuTE?> zxP;Gj<%g6v92U7Zyp($ePRiv}YW*jM$mFt{oL0|*fYq4>Hj{r^&Q=!7^Bjt( zo3Oxjt+J-}6WZ|p>~pPG46^Xl?!EC*6IJl}$mhJ-_sJk9k6+wOCy#x7&Qc)*6E0^$JV=;xO;LC z+)-QJDV1P>cs~0n*V{GXB%=%6GE2AN`&rS~jNX3eqwym5ntCl(+>{<}87zgehd25* zxkFLdp~o!Zr2pq!ISX(0yf8pdX>9vU_6nF`xE__m7l9)f(wr?$mElW1a+F3zWiZm^ z#4)!+q2RGBt6Aa&i0W2O2K&T#;XJR))I}ev@Y@OHOHWm+VDG-C#}p3OfEHSDmDk7( zJ-r>0-Bhg$d#qgt8^7k`Imyi>+pK(;r^?^L%I^T4SYOIG_cjKZyZr5TeUym;9RKS; zeQ}UYxBX>bnHy}YR8^MD&cSaJ3UphaWkb>Lt7U%@TmX(cx}jU)gbE9BUAw z+*Ia{G6)BGqpAPAp}T8~|M~0+c#HpemcBg{|L6Z;6Px`$bpM%y z{Hs3bXM`-GPjZEo*^?B`cT{2gH1m$fAz_F%?av;&JH~MCZ?U+&pC9b?{{HQTWH?F^ zAsI|WvT@wG+1L}NF|a~ltTe7E7`;900(G=%aCd8k|BAjO$og_4qtb{QHEm6XM$Zc4 zxs+KR2_<1{Tu9yBT1^d|Lpd7{^>Kmzpg)4_TBd*|xkfPO$7T4^L7WPBn_!AJ{$D~3 zg`jRNznha5AG(xtT_&W%5p?HAXKk@xg`x1+j=dA{c$eQHLC)*CP)^5f=-Gh_;M9)& zM!e*<;EX|#oKJu&dezau$wtA2C#o&D=;9<`-u2*6i}lGay{{G7d5RldG&+;2-zYeN z-}Am~o&Y8A%q&lBVy79J7qJe0Ya50wnnJ%{cM5=xFE3EmZ7JYdx6oqks?GB|quyIy z)UpKKrGJ=DSJH&XcvQR#7ssy zre)R$c(6(OJoF9C7$Yx*u1yn00q<_&)=UlSZO%8B1hwQgf3|Vl@8vcZ(Id&qZJXCM z;8D89D9Qt!%^s=gi$gRf2S1Z~w{ZCKMT!zBN~}IA{AA~t71~d`=}F4ZLR;0`?OYes zk7)82RdGkfFITtkU|6t&?}?M6Gz!ouKkj??Kj6`NTxolrH63DR+Tq$noa-V== za|`aBOAg3IwrlO3n=$Y?npJqIRtq!ugjfC7ZVK<5HDB}JM~>x1&JWm3n<3*({kCsF z9Xx5bo9$Jm16NYRqCFUo;`2v5BZT`d;aPf*wyhu;xZb;4puXWhIF~iDo4GX|-7#lm z+_cU_cWc^a1Rh5MW9hCi9_?!Ilc(FduCEa^4s^*ryHyR0-CWyUMB~x^(7lUgE@j|D zoz@a_!6>GB@c!O~x)$(bf?vP-N+^jZ{`jSe*f^#vbF{si-wrsxYVfe|<)bbaTGD#? 
zCAQ?=SNFWI32>gy6fH5WLhH$o4*gv3!@?2wT9N`X!T7~#r-;HjEOF<+y}vwVFy`XE zgOoo>c?H|Ilr5hlfk<9fk~dujIwSsQQu$Xaws7Qj58`tlnkGZ43wSHyf>3+c*9xMMRrBvJl{ zN`l7$-EK9F7ZP^{b1!c!ab*}GrbVTfI;kYCX~NdN1`lgE*3m`rZXpk>U9#rb=PU@m zt@fIBQib43?OmVWQssk}N#2s*|I0#OKi|#2^5q(SHGfxDku?CEvesN(FseY(R_pd9 znGIMj+-0)lP#Q2l{XvCZUjj!8Rn;9jZVzwH)i9+ug@GzAYG9fd1K{CDoH|1m0F-!$ zSGMWkI)^~De&#et?c@I6owr4RYT*%CvTqjHeZ<83Mvx7Vvqx&@G`z69=;9PTPc!6^ z|7}w~+XPmOCdasb6oQ{8=uPxfv=Qp~*%)rHc7AJ%W+DEdH0(UyRXoqi5wRaqCbY zc-T^bSmln^ycx1ayvIpg`FD=MEUl0hHwUAQ`=Q~FrKY=5m9PACUp)p-nN`huduZb% zwoQqR+HgpC#R=an;bA2FV1&1Z@F@_SZ-PHda1jY!7r~Js_y$Dpmgs{KJq02^OXRwU zya90sBlfez-hh~AiFY>VOhvQ;{}V{Z8Bam(SdUM2{P1|s7>Qa1cy9*jw;(oy-)ncQ zNbRL37iFK_y!`FfR!1 zdv(tj=|{cT6+~r^@}}S5bY3}dT{>MYmO~kY_IE~>nPwt>Ej1?>UBdr54dG! z`R@+a@OH*wdp~RZCdI*B%mx&Y-Q+c8RRHV88Y!m~!%)$Heee7&Sr}75x3Z$;0~epm z25%hl1WwKmlEI-QP(5_?-fIId!0gB%^Ri4E?wn?jIe79ctnSsVWNJ6WGa5V3J+)3l z@pPg$PYD_VN7+5`Rg^J6nQ?p1VUNzR4BDc|=QTjRvCg>N$On z)N_!0cIW7DZaFI9)eXwxE=1F31J>27T+y)>y+>mn8K~kdr^O9*3}|83zJh3Ap5?(n7}ZC)Kbz5JHmp{LzZ$5Eu$Qe z=#GD1murA-uMcWktfzv2N6K{%V@u)v$-dvS;ogS8&K!!*p-JI70{Y8`;jig*A6DKI^OYaTQroEG!`El2l$T+*vDsALd_Su z+ruQT!w2L^Kb{`5Lzm&CAd_zjXlUMlU^>_X{P*d$-kB3A@V&OURMEi%=(!%qm{}B$ zOHuySkh_7Pk<5_tV8vOm{O0pHvH&Nv!A4^9ss?~-r;msi-N=QCKitaVsq^7F%TKZZ zjx^jbeQtE`UKhY^Gg&S!YYql2kL>x)s)RhfHyNKx%Ye4{&l^?Q;jl6ww=rol2sX{_ zho&R3*jVI`^?H&hw9AKg!bJ2zb5FH{B(D-;8R9K>yWt7Qyjj)-lbk{GyC4nyRY!2s zHo($;<1(7IXu*j!s%W{ieRp}9Ixw8~Nido-1M!+tcb#`T0o5;+A5xzrc8)Cvp$5LshMXGtp!J-Bv&etf;7-$it57FD^tw!d*W6PZR&Xx# zTTxxXiY5)zZ?ip-d>>1|>Ki|je$(9`+ra?sTGn*dKTPub*0Kq{sz}6!qYjJB@hM2; zl0$LP%UCoQTr!<<+!DP%VP@qe5CJJdr8I_Rs&LB20yp)UYv{$?C*NiIG~~wdZoh@( zSyGSiReRD!NoevkmHFhTKHhofhZ@BIiN8~|>@fS+A6!v6f3o-XVKjQAFT1wg5Uf$I zU){+i2J8}!`CKnIgl(l!w8=hUc(Bf8&)QT0G+s{(jEW=W=T!~wFW**0=65%(PPdpL z%6t3IP5YYx*4r+u3Xbe>*-E40T9Fc_J>N{van2fYH%PmW@cH6GC-X@9G57y@#R=an z;bA2FV5k1iTSNF12+lXbpC!161h0$W$Pj!3qIXO5!HAv$k)I`UT}0l1xPuYQM z%(KKhD==v7SbV+-20fhEeWTq2_z7H#WOc~}NiEXeHJ#~bZMW|it=1CgY*b%07*vDj 
zK=azNS}a^_Bs2cO?gZoooZcCzy8#c|+N8Ls4CLjsbSn0>IpR=ey}6rL52QCz-Zha^ z0DC>g+SV?cgAwn)A`vAKfXA$&nJq*M&2uqt^SYq(@NaR+L3!kQ_^#^#4LdYv{Pzc` zcMBcULtYgRx}m}oao6Qpy%8qQ+|JL!8R>6gV!;+jqVns%yg`M=-hX$n;N%bNQ$pHE z<@SMdai@g9R;K=7fukcBlcE0i&aO964-}7hVQ%x~NozM9^e}UF{f2b_GJci2tJTH^ zY4A%jg-M!0)2mhF{9)$!1d`(~S&@UGp&Khre**x`NbW0-D@y3+>s;L!MIThr`uylx zgAWojpSrW;{}5Yp>65jlJ;a{3 z-V&0#hdrc*#I;gNVbBk5pMkDSAkWLW_R#k}OqLGlFAT1R7}^iL zvLyB`->8QD?}Gv~YOX$?&I`DHJRqigp$%&Ox1i^# z8Hy?n^xqk*J%i}1Hi9C(lAtW(>AKA4C6N5UMl%C<5j?cjHIwCR34YdHYon`mM&Cd4 z7a6$}!Mh@n^mL(h@WJJD8_gG0@C?`1xr0-ts7?5Uz=VJ~XjJHL_^n$7*TSlcXcuZ= z(3irj_2>%t=xNgNdv)$;ZCB7+5j#(iz}_YCwIUD9aMj$Ayq7fL0UCQA z#HY<;VV=@K^JgRuT8~6Yjum$f$~xG&&ZFRoxK1k2{FD?2_rKUoWLZa`oTp^Y-}-&v z#bb@G!eUYQoASiTZ9j8EYrsEmC~|rK&u7;olBi}PY>Am!FDbk9S>TS#EfUv-y`U*u zeem3qGc)=Pl5+u&1CgpU#!7Dp(nLEYF*7hol3K=2$RZQaP zexTcRq0t8`nWcF5l5$G@PnV6i{a|euS}eD{HTK8Oac=t*Lg!gaw>jTb(Z+)${w&tZ z^0Pi_A&;0zzbPbM7oN}&QXz30Tv;(2U;p&jCh zdAFUP#pAh8hqy>Qg9yfusbg`PAn173zdKlIpY*@|Y;f^`qV2uG?ZaCNnJ26f<@_bK z6Mdm?%&&$Z~a7yEQC2jEddzY4!8$VEM zR4SZtw1itV$G#~2^?_fbwS9kgxMTg3!vC?RDPzjY3&-8RzrZZ1hmFruyu>Z1o9bS> zkKmBIw4Ge!qqxxOz^=QTqgdUcP(LW>1@7z%fyJDIn0xMt>*oVKSk+E*<yf4rlLt6LKE1>N>A0N6w z;vrs%bIX$r!>czRl`xWew@j3imSpppK$f~idC;g5{uG!lzg~D9=dCRiI@+h#bv;6b_9pkQJarai6mMNQ^YjcJKD%{VINBboKILkttBwTDWof@5y7kg~7}4R+P(Zrk^M7;y<9!(VmRQ9n83Z<~7W?*7g+zHDWpK z)^~pA5Wb;ew|Q;O4c}L;{P4il9iaPfkzaQ`jHxZL)E1A$1tNP84S6hhGv|_%-PQ>O z3mz;=c;sPMpYH)5O!EPQVVQl)406dJ$=8LeZe11Rb9oRNT|)j`PS3%3QqV@F#q%~ zfSpV4`&&cSy0WCsJ?l|d0pEAdF1d~<12o(Wy2%Scv0S;S>>H9#3n`Bkd@aD-L$$%$ zE_#sWc!+S{-{aWzmSVK6P6#qq%g765b44)%PAOqNfyk@t$XIwmB+|TM$Mk|M5%VQI z(cU>42zg7$YfW^-{^u1Ze7A&$k??~N-WtNEKybbZ{w%>oBzRo}M~2`V5WQQX4@UG9 zi2N*(>mu?7#2t*-&k}nBVxA@5*{ks_jWQnwAn)=2#tw(KU>4i_7fRz*V7nCZ>I(bod+rwz*Quxmg zrtjDE&s#G;l>N`A5T$b?h{XAZ=;JY;U1D-5KtyhviwNKNjzyDrUHJ3uNVP;Q4s=T@ zb&JF|fN{6aZ})C7KXs=Fx4%D-CfC^RDL^vKBV2=->WFJ_!esv00l-ni{V#7osp$Og z4%Yf&Z}s+m_N4#uZ}a7bz?T1HQ3|Cq8lKJg_s%-cd{2z?ErX?*8EyA&x#OZKU4D}Y 
zbx@aQ7bIqL330AZ+fGH1xL28nA(k}9FIqC+WbTW{ZVgPCspBCqrgh^m8HvyMOYR)? zV)z@#DI-qRTJQ$G7k|qSwI^V~>GcSqy_3*u#h&w6&?J1Twldl&`xeTTl^@Oydja{I zF4Iibm6E)CmuoJ6yN&uqt`!^sow$|y59>YQG$j|=9)W%D=GKNS6NkMCW1_uHYc zA$aq{gGs9 zPrYliU@3z^FzrAFSmzrOWc}n0in~t)FkA>j8+TPBf=Utrsd*up^Tq~;o39)y?>moo z_4v$K?=y#pOg(y_z#l&v5%}k4p4W_FI?*Nze^wf<4h+b`RyCddWa1Yg_3uk+Z%;EBhkDj0U&+JIMIWC2y03v8d80V@Z#m*ek9^z& zPy7ejUi>&Ftm1=iRP7t3-JOG()~?;S8DoZr6LX^_jFM25O}byrlQuj|=1aTW?tx2c zgDtDFNxet+qJjDDUQC%d%R`~)0svF@huO(9$dE0rbE)1A79ZfuHa@unvRj8sM;08! z2`I8DyFnZ@FsKaYQTu`+l`jwKucgA$zgMTo#T4L7^IK+tniII}s>QLGBNk|-U$U>~ zU@ma`)&Ifz5Q5`*-*~M*xWHGm`4%*vC}7$>sRtszEJ$l%&8S<}fnaxQw0fSEACjhfl0vvhRXf#1q7Ypo6*f<0NkspTh>z}uq4mBvUbkoEOiU7JH9 zdfm9sm*olR^ZSF--5sVEfqNXg7;zLtP z4znO6|3k=CNzW5V{Mp!2w)BVdYh{TFLV}q01=+J4M+WH3MA;Kv%Z>@JIN`e`JdA`N zjPTYFJ_UmFP4H(4E+WC}A~-Sx-+<`d5`8eDr$FRqiCh@y;IQ zxjM(!{RHl3zIEo*bOb`_KMv4^MFWZCH(~|d5VVf=?W;U@4WC)1ydiPqCTy&?52VSz z1zp~BE%fjqoHqNxa@%(cJn8;<7**-+{PTl#8$bK!tyw4MWG49(uyu!W@`n&cpfk8I zL*mcER;r+O5*HEP-u1DFVHTpw=N*{Dks;lohqn0!u((8Tz1iLw`5xP2Na}-Oj=}v< zqsbT~q*vWs>9~L>Da2;Bb6tR+_sMqNfM(rbdHW6~Wj%V)%=$9QD0X|!TxkvVAEr*+ zo@ZxHtF&*wv&?*Z-Ccj^K&G38F6=**5p#I{?x<(-ApOVphx&hH5a*{8#hG&&m{LQD z@~w+9Ji%OdR(_HmU)zkT{Wi@5-#X}Zs=p`YJRFPj-${gn6Df5Qha*mVHZRUq zY7fpIp}O`<`U#X6<~=m(d<&sZ2CS*1JJ@~+VYcs1y|7d_ra+aZ2JP;r)*Rl<1@dB# z{*=9FhDiV1ldb^LeqH3IU|>l-gx1C*t8UkDszJtr!mJlay7N-k(lQ(^{|=`Y{+I?P z=J|vUNH)R6x6GX~QMDw$mXnds4QHTsy`39pMIpc4Fl<*)Hi?%wrHQVq=w02SgJ5`S ztMJGF!a(eB9p6>Wc;N4G=hOjXPegmGN;2YCD9DjjO!0~hfIWUeh|-I-y-19;c4|jYJNo4aPpDrXh&|ORw`@N$5RQ z4s$<+Ju;3}xo*4Rh~>U7njSkEh2~Xreu;lkLenDi2O3fULCtIpH%fb=b-J>$8!oMet7KEVOySeNhql4 zOH76W$sbu;!Wmgldyv@Y;bt-AAh!vdW@ zjf!_zxPo*Uf0~u$pXZM&B~`8Lq{cL3;x*yal5lF`c9`=?S6sd}D*x2i6)k6~YUe+{ z1cp-Iw6xKf0Mo(W#jkcdpenn4=9&^(i13ONzFWe>Nch1BZw=v7AUNLyf0p1P61*;g zBSY{Fh~6#H2P1k4M1GdYbrE?3;todaXNkQ5G0zh3?6NdXIs178rG-X>j^4b6=@y>4 z-Q1Uked7Bw5_Dq0#LgRD3$ae9r{P+%Sz{(@8`=|FSWF3Whx|T#r!hjap|c6!js4N` zxyNS?7=@z-YGavm4F?fjY=u_S4Hrb2S^dwaaLRTm%gjOviGBL`y0#`99Y1lDZ<~ua 
z_IGD3a8)mvO$B*Ha`{^a1 zU*7po$FE?-a(j3(VE7IO_~)$7yEH&{z1iaQnKWeZ=UbOucr_MDIbpHHr3=P74|-{P z*5IX{F6jCt0fVTq-197MNWIr(&NV9ra34EoevvB<75{k5FQpUb^74k(L+Pa7@pyB+xn#xtxNk&wE+WN?iNBtq1O4 zc*}_LJ892qO3t63@FEu<{T=(Bi8CGB2VWnS@XUoTuO4FAyzPhlu8!=>)009wU2gf~ zvsNId^xNWccMxK2xF2}eHy(GjEbn~n7Jv=kMyhQ7mkkpWz3?h%fDe#UJSmEUy#&RrY~?C4M6n|ksU7}fvsSKza(=HlmY=WzRJ+!8^6j-n6;pQHQfA+}R$R@7 zpNsIongW;Is+k%%bm7q9WpYuD2d~NSN~>FDV!WcfN$%eK6Q7$C}uZB}%#npZtG3op(If-~0blTCx&N zE6H9V87IdkN%kfyTgfKbJA3bykWE$`4kf!mB?*<5ND0X*znAy#`~3XnZ?~J*<-D%z z@wnf!BW2i-PUmTs$-f3@;p7!1Qr-YFsU&aT!C-LUOec7J035j6`qFaR5%k(s3RY;k zfMng54BK;7`17#G?mHq-njv|%B}En=jxuq3yY7k(Yc5~s*+YXS$wZBMtL%X|>oP~i z^Xq`^AOGhWGefjO&hTJ)tgW{z3SE%)d$>OjQ&?TA7)@0KVoYh4)I7Gxug~d43T**?uxWP4=cN^( zOxi0%!)t?*Kf5;tolC*Rg&ul*C6X=p8YA6^3)1?^fRRcLnCskDKeD zB`$GO@Y1qQg%Pp)UQISgAeDeVYafrR2>kk@^`*fcnX$N!)B0*)vBUD29a9C+|BHBI ze!VKP_Kj_?@UlQTDZf7Ls=AI`znKot7wbau`4?@537PQwIlWo05Dh?)b12t(mpafo zsy9};UmQFQ3YieDxd{*7=A4`gCH0AEPQKZDUl&W7GDzsx8UpW~O6enybWyccs=pUVt`1Mi1iO^<~ zmK2>PK}|437RDy5Wq>#@T)V0J*8=E7jeOL!QU*`O?;Ji>cN*pFNi8v_I)zMogkQh8 zeg^RDtqYkEQUTBRuYdn)XAJ({``h$7#|kt!d#7cA+gI-Mz$*AV5WgB ztLlg?{GM*aZ5wEX2dx}=MT*WKwtGxnA%?Q(>g4db7cr{9mzQskHzzCUK6Uvmm;!k8 zx4moViY)vjLL0Tb2|=Gh$&;HRGg8+a2M@<<3L=fE@N#m0UQ|2WbA(OuB9s*M>{O|< z!+9%qY`ZM1VEvw}v9IQ|K`qUX$AX>e$fs}RYt@({>8oacTGSUNRPWMwPCor09AL?M zr~U67=Ggfrb<0s5s;E-^D0-uWm-ky~9JLllelpANh2#|gqc5duOcysu91DK5;~N8b zt3NFL>oFagHO`OvvsVwjj$3lPwe0afuQ=hmB|MCTAB^zU5IzNi|KofU{8@sFNbtG{ zjts#!AbPh%AB^ZJ5cydm*G1$Fh&vdupC$GN#Cw*Qv%gmZ?g=^W|XNfa=6sRWz`HCkCM@3mz>k{3(l%sweZ zZhMk4_=^yDNLP`y?FSRHzg13kND^=+ekgt{#SU+CzIWYxJSG{Y2*#pE8IuJp zL6%vP-!`x7>aTUDpLA>>gKm(Q#5aI4&RZ>{-Yvco?m3ATmfBciO+S64Is97Zf$wvz<4<@?!Ekl_u8P;=e>+{%}wlaB%r9Ke;0V;vRO;ZNFz<@Uwi| zp0j!lWzQ7%P~v~KUjok5DIjkRit9qZ*-)0LToT`&(_nbVv_ca3X)yaia&YEpHx?I; zQ+x2f8#9T;itUPdj9<30#GBW2jjxV_7<1C*fS>L-F_Q6G07Qv^ZsjH~wLM##QmP7Z9Rc|Hk>(2|SQZ zKIt-F2qlHZLd|nXzCgiSNn9t~@Ik6Bx|4I)z_rh}wyuc!f*G5C8_wLNuF-wRvC(7UY^XW;=3W&6y}w{zHM#$S62oeB&CtvfFOi 
zB~S)*-YPuF{ml;?dIl-1*TVIP8fJOZt$8q%S4><(2=qX3v=99F2$8UTm9 zf=)gq?G-hPtC)VqUIMP~#EZyd-GQE6(`4qu5d84#FX+P0 z;)OP023KL9Me+ArpD$wT&pf}AOk=Uro-$#RHCJdPt#P8G$rUJ-1~zuYlJ+N2Uy?qb z@_~&_zYlxn`+=ti4&L|J@Pj&kPK_it+Q6v=-+2-45a1zgew$}A7k*dn-l|)5z^q1H z)7PH`0!n@QaIw^4Ncm^3V%7H~XbE9C{uc;=3AP(qc^V2B=SBbZCR4*~Phac2nBjqE zgnbyDX_djJH_1MtLY5@I9F+!RtRp@u|MTQ79b-%}mnP4o-;wcDtH!r})y8%y0>Li1m9ExaTANL`<1xmh4 zJ`eMZhNheU)~4Ky@Qhr)Y$RzOAIdb%s5o#5K2M5&R+pzinvWl9Na1FrueGz>~BievMwu+nYcZH`El`a8>W z;QH-w&22vzh4+C`X_B{Q$DGrzYJLNtb|ZdjoAWJ}V-vm2pPl(MQaF#C0jtgGf4nZ0 z0h%I?SxMkpSl2kh&xyuN_%^#^B=NGvk?lSh%<)~cA@vksg66MxVNa~Uh@-6R(Zl+P zUU@i+lsCXV+>O$tI~WA_Hn;b)q}-yWA$y`WINqsswyr9`;o1`iSGvC@@t78*{1wjhq2|rt4SU^qeue}eTnEI=xyJBsyh~BT8 zd!r5Ry{fa4ezX<7EbUboB6)bP1?rny&X9b`fwXT%y3(<5yMbdOStzPco-3uj)rAxO zUR8K+SAn3-f-V^Cl=%bj)gN7%rL!$V~#57!>-tau_FiIicwbgm-8CP!o}&Wb1E zT(_sqjP+h<%53%NrOTCgT%Mtg^-w4<^0Jry$`=jR_GGj$JoAT5k6#ZZDwJcrrK!~7 z#~Ju-zrCNVqAOsJ-XRfmHx)c!DIF1HFonk`*}C0+)?$JD0MVS36m#stQjxoscDV=;Of9rb`je0)-TDV;(; z&B2xwcey)b0I1ZckPi_Vd3Ywl=Vn{i9g?pG>y{z$fYKsEs@ zkeTx(IcVZ{pO5V++l&V>3{10Qx4(qW`q4FYmrl!4; zl#S@{OZh`o9w*ho3kwlno}GfAWk1Z#=-vxhW|CWv?~jAtJdqq_>!dp@-4c&^h69wg z5_(LzAdbhSZ67p+DxUjDmWiz!vnY5X-NDdzK%n3tG}#?@^P6@kOh0}!UdbQ}N-jPv^-Rr!?peZB z-JjC_=M^V>w}gk0@PiTF8p5YQaJ~utEWt%2cwGcXhTt0zy<4IWM)VYj{49~{BJu{r z9gNt|5_<#UJxk2l1O9?&nx`2`3~!_vYe(R^<9w5K47r$NwUp;@v;{oZWp(QQcMZsL z;+p)od|TWyYFW7^PvRzPz8Kv0ia(}ZT8H%JphdMMcH6_)@#dAowjWGvXYSyTt2-$F zoM61|QxHD_o#;rM@27Ge%JzbRV9Y`MKQ7|Y)kl}C{j7mw(btE$f?6nJc?psD1~|a1 zJCxMBg{;)_Nu)j)bPCbZ&dl%!(sxAn1d7-q9y8-bQmzXp?PQ)H4x>A^AB}m+_{a`k!zl51lm^^+ zY%Ked(G%#P)Bo&^>l66zm-Z7io~MxhI< zM-#jorLtT#&;b?l;i7aX7>U1|y}{{FkLu z3vqb*XSJHC1bBz9@$P5gTxfe;RB@|146Si*d7P?uL1}_?Vx?hG_?)jo+ckABxFXkY z_k`q~C8uVMd?)LTY+Do^D5b+ud|=j(_9bt`emaTr@C|=RG3z4r`(ZL(On>!TCCdTM z%{WQ&Z$(4O#uPKUbvrcW6zKc0zzqc-NzO0$?Tpr#8#KQ94BV<`i;aK4Ga1?4c zhZEX(RUAHAqV~uM^iVx8rcu5EKjrzn zT-y~2?Js)o=N`F*lgX}G9XB$95C0u@CXXcLo9Y#y$Q?dB{p;si9UmVy{CW0^?euy0 
z!JloW_ks!hr~9r=l+_Ems4|Jo-HeAKk29~|Q7(XA@>ef4d+Fdp^2Ygy?#pOrMa$I< zrC6-zLH%b=S`E6hNBW3=x`C1&axP5R=3>1TO=-6e0kEXpE?{6h620n|X%*V2!c+$B z-2(2Ba4IX|C7XC6YO%OmI85ne8#{}8SOF>cKS&ozbp>3vv}8w(B4G?Y+ckO>d#rm#J6XDj3;Hh)&Mg~X zMcrXu6;V|wp!oSjJk>%46z^u3-*YMn2Ymf>bWl?pe)89I&K>1JdSb;te#!{}|F|}Z zi55Hb?ClBpU3}JStH?Ciit2EogV(OL6QF_Lm1LOl&SK3We)rzkHoS- z24FHS7*lh_9AvyC9aG2bL7+ya2dlmd#ohca zNch1BZw=v7AUNLyf0p1P61*;gBSY{Fh~6#H2P1k4M1GdYbrE?3;todaXNkQ5@t!5- zES+oJK{mN2Xysk_+q<_A>mCwi3MB1YWZp1unRox{pf&R5clNyrr)NNDj&>qE95`5# z|3?wKUTlylO_c%)`c?n=ZmpNeCr*+)jAdHq+-n@jP;dK+!nU`DR_)44rm`PgUim>b zG;N0t9Cl-_I%jw@aEA?0V`?A!J2To+bi+z=(@4Pd;cO#bd@Q6z98g-)sbEbwlk zISZbjL*jp;Ot#;%4e;`RbJkNTb8qL0306+&dfHkkkJ-wS?alL_W0JmfUqjV%Jl|nK zF{U<*hstX{=$Df`8UM~RP%91Lygz6BZ0)`VGyem!7$y%IfSJt(#fcoBasW=7AJge&AnI9B(1E zluLD4Hg*7MO95stvptZ{peIY|$t0}q63gsG;_WRPI-RS#KziQ_y?>%QZUR&*niU=% z^Cac;c$jJ>!eQTn`(^L^ovyNA*Cd|NhF*~NQmS@0Ip z0lzZF*3E%h+0--ZPrjghwY~REu^-C#HwP=uJHZ(L168N2eBr-?1MHNAu6XWSpIv8h z6m%-fRqu*S!tcv2Nve4XV%ZOtq8)sKqBWaC^CH)N1hyZqGgz*x^Il+ zT0^KR#qrLT8xXkO~VNZ|`&)vcXXYcBQ2)ngYE`FSt$m zN?~R7(7^S}VrWn0+w3m$GeA%Fm<9zFfz0Kb!{MwPP$5G&WOvbv4r=8RGrtGyXdhm= zA)%&(K7W1eW}bf$3zG+pRXn}`>kFD%8gn>t=MVco(NuDv_+j&Fc2Ns-{JO)+!~A@_ zs{dxlvD5{={GxeoKu8a#%sJ3APs=0PKlbuNDi$c%D=@@((t^Z;53NX>;)P%IbtT+* zI56WRjfm5kf6_@+Uj+{9ypf(Nr#+b7%Y&R4+YV|As-nZX-AWesHG$rsf$ZNM;jm<8 zsAO%`3Y!aUZaTlZMCy-)&YVp38Wb7b?>s3F2DPWWyK4cKapxEsDW$6dzca^c*C<0S-KGTXN7!4egU7A&=*ONQq9)og(!V zV2-Tlc77HY{0pig<+^a?ZS#mh`eTUot<`$aA8yc2F@NCoGkWlb8PUf-Q4qj#Htc2ji@2ys{Crc-E6D_~F87BnimD72w$$_t>eS()_`-&cwPe7W zID5NzwG#@ZCb(*8hLHMTpO+klEYZ>1LC@^ZDPdaPS%z1v`uJ-2XN{SI=OD$cGN$WZ z*Aca0tdPeGdq98rjg<_OAw%8=hLl$Ua&-lBK>pYNA zn*LdTw*|f>8T=OU)59VT%CBRFr=dsSFBhp^BcMqAl68yT1)12N=D)e23BIJHedS?^ zg#CUL2PpJ&@wJgxZ)Qo}(B%Vv?W(EvLBHA$Lv|$ri0b~=^Cf-ui0jJZ%YQHHfZI8E z>dHAM)NxO$B2PjZj-M1bs2k{t)3_9uG`?Pfsz09ud43Vc)v6K_w52>SfsV)TB)W-3 z>1v+rF89TqBdQv^tFqt*jmmEd^9HEmNwK0e+6&ctsVBJb`Cb7V4F0sqd2_iQF=p^=8(58nDwJn1YqZekkSf6;tF z&Yb{UfQ(Y$uQnJnZ0gYSb^w>iM^NuDN&*)d>MmUise@PcDN;s-A 
ze2@Lj?uE`ym>VN`r9@31k?54nmKVIyr)H}lSCWmw?D+2pvbr{GL#^i2P4bf*VrP8v zzT6MQI_}rA;RpsY49>Is=2@hCg_g;XZW?}16JFdkkpQP+Z)%uL=E1#xY{gPTi*U$= z_*a3oI)FD%c>H~b7MM7}N8{N+%FQondoh)k6#R=ppJ+bPlRu6?%MH)TUU`$=`GX-&`e8nZb|}ETT22T4h&v%!Jy-x&<}1gR zNbii6_;VwPM+@;q4P(6!k3y($K6!TrxjU(Mbwo*{pTtFyol6yqb%0OVbIjDs^I`ke z@uT|-NL&U+_N5=D4!~~jfHr541}41XgzuK{FcN++!dpZ56bQ~Y!Jj3#hy<^T;K&es z1EP0J^udUp0+F93a$Q8;fVhJZ`&nXdK)h#(Ia~4z+^PFE4tvL9`0wtYf+N)N)-pY# z&~@YK3N7CtyfKkj^rzzytoC{`>}q=hFlTUmSW~XW)w8O9T%-FT-GcE#zfvezy>L*x z%C8a6Ds+OGZyiwR!h@a)iZC$t=D6)iwF+EG{g1-_k}z0#Vtb6wQ3{;>9_sxgA{Y0# zW>JK_t;9dL40(P>)#=+uoEayvJ9``a5qe zq_|m_&M>D6sFr0;oV$YJetn01EEa>3gD_MMt`L?j$<=k4UNd~;Oafwl{ zxdm#NtXs(S$w2#oi!KzWlF(7x_j}rfm7x>K9N0V?f^GNT)8*IGN5Lx6Z$+ZgV6ULt zpy9JJ5>F`o^~&c&kVnf(;@M$n#6l%tC)k2RRZ5w(`E$VV2+vKIeWZD?KehI8T_Q5% za=N+TDuo<+nWILO%^{aH7i@?39-|;%OTT z>zqryxoHwH`;M@zF*0j7kaA+Oa1V)VyZGkrns^LIalB*O!;K+JkyTAqL^BqxUA&vFGFwJ#%k( z?2e5zb*>P$pXO8`Pl-pW@oz63V6{O&>xZ6Btt;RY$#nQ~%>~(TWVW=@xWIos&-baV z1mF)&m7Lur`tV0G%li|?(m3XNf9&ILTyVe1e#QLT>BlBJK)ZFTAe>Sz0kFD zDXcPR7ahK9Xk%Z!ZhJzR5UN$0R6YqZjqoB zV#gxq!4p%M*R5qLaZe=hF=cII-1CU^{eCbZ|2K&(((|$A8^{6WPan*+yy=3s!YHew zFAn2PL9W;0+(jVt=zy>ua}$ipa2M*dBkf1(*O?Ep>7c*1!beY$yvTuFRzc!)MOeO@ zLCbaO9u$-Itw>7HMJ?A48(O})0`z}1GMtRPkL@_F7W}x}gh$fb-+yuJgsJB~(5vlm zM62mq?Dq$45&KK|xrYi(m`dARCSa-!Z|v?2_xA6G4=2{&;V&d_`{pIf9dcHree535 zm?I?K(z{M5MxKTZU%yVi(4GLjvz)xsmogxGvUZ*oZ!R{UJUQ;;>9MbY zh}z@sL~wCFV(jVa_xniVcvP)UHh-{ya|#W+-i~KMOC9yyH8*Zz@!ikz>&r7R-N#b` z*Ella=O2Nx+jDm340%#Qh6~J}KfdQvR2;m;^y_RvcOCo_(tr2dtO~Z6exnyMdL1N3 z>(JB-=-?g4`NV7L6*1uzCw#Yrhmr7u5#AcYr$BJN3H~g>MI?A#1V@J88xXx)q7O#& z6o~vRk?SJz2E-kV*v}Gs1L8eP%vlPMp6U7VBa}E#FD3HxJ@la-vyAC~1$6>+)qQ#g zVBu(|xrRwB(6NaOpD$>{zN~Z~YvNwPn8n@3mTyx4Pqs$sta%qMa%|meY%&aeWuQSL zS1M4y*m?b@A;Mq8BCh-^a0MBmA0(e&@&LwjhwN=w%P{>jn*yW1jrjTjzD#GSHfR&! 
z9;-?b0xJJJw0l~314(mv*nBXl!|Ybf3**l*j$L{1m|ox!+(Ul)@a{oR@a%!^D`^F9 zpe8rX5K-)kMsJR6-#Hxa*Euy@W3N|L)*-PDKj)o->Bcllto?q%5QDy&yZU z^5F5?uK1+=`2ofse>yZ~dXE;L2t$j_cb!cvw9)1ZwigQp1+XsfRG{zQ1T1+n`G=aP zHsoWXWv@~7L>x+*9XTit*j(wIxf0w1I}O5U1}p3EzH*hJOgA4GH;@(38RHAm%pSe% z$apx0%L#7>0YX_1-rfgY?~$2?ZR}Fp$mQ z8^8T0*cfE?N}}@voOoj{b|Gc}$4nfMJ$*C*EUL3%*HibQmqdkG$@>>LRLAyNn2S?} zE_RMrR^bOo`1F(`N+*`8#Qe=ybZ`N2i?Bpj&`Xm)`% z6n-A!Ay3~G0~OShN^^q~F#F64vt3#i;8W*6?Nyaf#7`;mZ9Fp+xHklLcvV;6v9VF- z^M8|INY9Ty^~S|;Yrj}Zt5O*j4$m4JtR&^gYZ71a$A+RX-^XNZhPgZ56w}$X35S(vad2x zX){Fe3hGDjq}}RI1iu(GTt4}B;-jzI1U;jBpBHU6r9)mD)q zyP*)jSCrdcPA@hVDAooj;uyFeRXc;Rvw0$ORR~~ z)qPMYF{!aG_bDvN|M@lT?l4SI);V*TCl~(<`dIGA9t3O(UsVVk%7x^WiP9!~ML5ox zT9dr|4%qX^CD1LQ9*SMt{gU%RG1ip2Hr}3G1H`+NuXE`3z)TCXlw+V2({&y6iZQDO z^hZ^y_WC}DyXx#+{w&0xp}PEWTHPQZkbH77!BZX!vgv=N7Ba<2!KT+o^mXwOcRhZ~ zbzk(|hk^44uP@TD<+mL3AnkYCXwzxN1MyggWM=U$ZTL-n;(oWeG58X%A37Wm1jN)# z1^5LX!koJjmmeG}z;5AuzITp!z)x~_Tz-1mgA|wEZ_cM|PHEbcaiQ<9Fli9B+obk2dXwQ-M^Axj-RUf&SN=1Cj4se z&$4Rhsizmp>=%b?cO9u*jy1t-HG6;|yEFX4FVT>oejADF-C+*qAbHtLKabt#_W@Z4 zBUhH0-N7*DAMsP9d-aOdnYVPe^zljR-Vh^Sam=Vnq4(H{PFY^|$zgjZ7!^eA+2CCX4mP5Ml^_6r#zXjg=TVB#3OGW+;I^Xyx;HQFLjagJZz@a>LUW^-8RjuJ?B&Fc7Am7ktSzng$!sFfF3v z+4#vj{ZHyiVMxSMdsl<86$))1Zho|wf@0ccsEg#H!SNRj{ETw}@SDK8y}5TN^q#sS zz!PeTKPsyvrnLv+fBAx!8IJ_Pg3T|c+k1oRDJZu6o~0ff|8LIr@tcojEIfvML7S)d z1gBwJ7244IN5jC>w}dC3?u0=VPSauG%eAiFt4-f5>~Qzeg~ZBZSKu2tp~+s|R)73{~ zn}rWOc0%?jZSO_*rN}~gw(9yF~D(Be4-=yJYdbH&U|>Z0wM^aEE(ky$NcPc%5{#?G*67{A%Z=)hlp( z?QPwo7H917$RhvLPdS)gZDVnWgA%=AWu})Eyb1(TGW)z(i=lAFMTT9W*_dbXcY{R9 zRVa9WQ1QUBHOM*dV^hY<4qo!xNO&EWMPmE?M-S7Az`84C+Rmph;-;uZR}uClY3hQA zptK-zcyH|Q>I)%xFk2Du{e8YN%J1xX7WCsBh`u1)$tV;K$LHswu3wGD)d|1p-puMi z)tKx1TF%hHL*u`)R*$=)$E;lo`(G)e?q9DZe8E-VAlniFzwPx^xlNsXg^< zXg9}8yyfHmBf_wz+0EgO@G)GSE4=f#;{jL{X%|Ru=?qgEB}WdkxC zQtaSrBYfPo>S#jK19eJkgB>#ZVMbZSA&1m%*ta5XTCw~H_taFKIltxrq94n*l(5vn z%XA!}T^<8CbRp+)_W?IhlyqEYRjLK5&KLVlzkGx%<{!N?I2H(i?R~?O@9x7n9@Dk= zefROjeVS9M99}@ooK;EW>RsqF(VzIUd~DyEG8UK{geq5>x^g;P 
z&>BtZiz749Xpvp=go4KfB>eMNPVuq_<|-|DILny^Pf))e7QJNyD~oR4P+UmI5eA~J zKFW#EXw-s(w4a5?VOJ4pZvexVsq9GaS@_xJ>Gqt3@(l51QCWgm;Cn-XjI#vzw_Kw- z<*1Cs8E@fqWdYdzfKiD|T@}r|YaWm!-Sw0n4U}?jSYW~{PWWyK4QtBG*Oa4Tw7!v7aUO2E==on6tgDSys0ldSTQH zwRO#-B>pHPeGuHp!_3-6R%8o{-Y(C*VE*yyWHSa;1s*pfSX+R)ks zjM>?)FykD&c7#&oXrB#Wt>?3qdS?V2@31;j``pDd0S_$k{ZiaxULz@++5l^AUWyHf z@&E>NaoP2Q<{;|(()^4`30^pR_)Bv^DSk!2X0uh^0HX(Dt!IX;fvbt!j$qndj7J_P;UmH9vO+7 z66G{#Y~i{{Xq+OLxix5%t7QZ}g;4!c{9*=vCfS^i6EXnxzXgqTTICRR8&@Xmv%&+$L?9H8(8?g#V>CET# z=<5pT-XGOs`D@(3d>n=rJmSQeGeXl(h3w%X&+o4Rw$?~X&(Ue{CqzZRF8F)6oI@J# z9H@uy+TqQpbN$gX$?*F>zhp8#PgIlJAef2lkW@Rv(&t!H;8eAnR_@!^j)iw_=F;a6 zV^eO&XhpFLkd48B`2d3%_+3Y>8s#K|6bw6}#{x8v>MHvz|C}abpWK~S07c-%M2*>E z1q<9=9rt@i^3G}0m_co?k=_vQP zH!XdD5i-83l4!be9xI#{+gVU!3{^HVIJ_&aB9Fhm%pwB#slu=|wRNe(qq{yY18%{>+^Akfv4^kbhBDi{A`aKDJu`?i?k z`95cf;%>h#Kfga1%z*BC2ea=(zGb^}Z^i?WfSr%A8ih4l_!7By@Ovwq z`Em8PNrDS_T{a<+cRv`>ZiF16Y;A@^dF?**FG;@o_Fwmy+T6e)>-Dy~jFs@;g-qd> z7Xm>M=hHsBtX#~>)_;a`GZ^}C2Mr2rSt4ob+2WaAh+1+9dc6VRbqH*Nza8R?FIgj(ElH!3$YCLQkSSOoHgR^Zd8 zneXmzZy_Vf!2X44MWk)BQ^opnHl$AF3J>sz!)NZ6y=s_r!LEsgypKI}0FT{0aa}gj zo@mdV=OeZk!52k$W|25=GxKcA9@cf< zg^4?F)Lc)+JF`mOt15V)3cgi}8B@2Ogl1yn)BAdF zO=*J%A;nL2lthE0wF>T!nmkeKAgAdG#UkilveqKfmxfmy@4WxH#~rg}981uU@kcSC zG-qh?Y*2wI)6S~rfoMiMg{k6b6uO=IhC%ndNeC`;PL^f$9`QG zu}dk9&DI|~JawSz6%A(yUKpmL4sl7xp3MCj!CB03N9Xf!e-C#g$*|WT{ZJ@YJ)LCu z(ZmV%x;4Tip~S79FhH{N($t~wivJ=Ew}qjAA&TdEP7gSzPLa+cnjxDNQzE`QP@@+|Pv zxI21{O#=%|pXp&9(}Kdul~=#n3t^4<55B3^yCH{dIaB!K%Sec!yzf(z9N_&%#nD^p z3Z@h56`v`D;*WNJXZ*HqLk-o}pX#|B@Ru;7mA;oUFer!Ly}8f@v`3C*32aJ&xP=Rk zCa(yB^|yZqU;MEFb~hv)^gMJh-IwroO>HaKH$Jadcvume$47=SjU9(}*4)oVnun#$ zON{O9x$Lpl3GXDb6ApOfT2h~YPBfrs|*8+E7vhZVZAt6m8v7t*KIlB0KoJU7l99qN*&XPS-XYY?oQ<;%(F9!e zD#7Yk;p&KlEYK5w>yxo@1FiYV^wJgupwZELV#8r#(1vpIH{i5-?CSXTB-{f9Z6CnOf(RDM@8j#W4vJgaT%@a9wozUXoB;v zU$XPY$oz4Fq_zSK4j*J#;!}WJ)W0ug=TP8(;)a($C#zwK&97z~;{d+zY?k{|kOYIz zKEfH78=!a-`DFtKQVu1RQkg0=4;|m35}3%sj%OW3T;^y!u@4K=uJ5imLHb?)xh&4vg{+I`gS94^G0y 
ztX%TK%?1x}iQeFcAP!%kT{Nytd!`=7dIo=3)ek~%OM91+`R%}uck0qSSF&-8=qJq? zhGJ~KN}0E1oC8k`eS4R9#2<`R7_2K7U4tcYaof_PL_Lf6)>Ys z@%CLgNATsVb)v()K%o0J*uL$38V)I&Jx}+z3h!tgo}YhT1m#-fzp-4224!YP&(XHiQ)O?PB;|{Bw zTBa|GP6;^mx-1Cj36?iD#AjfG=5d2Gm8+Qd!Ot_EKXTxjy8i95!C3s~&zYRAz*vU=AY@yIZ?CF{P zky!ZYmd=viO$ZcPrJWQt zBG*Oa4Tw7!v7aUO2E==on6s68EC29DzJ?`9hCC;@wSZDZ`rgH=2E4D7^4yKgSCILB z(}RXSG4!&1~t24jzDHw?DHhFkVLJT(!u_ z`52I66ju8CzAdVL&N}Xl?Gh)lEaF0uPRFSh&aD_^K4Ouam|+E#e$h-vF<1arYu~DGf*y#`?aG7o z!XuwFM>hYWT_zqJL)Ic^-nY7HU~K$>=D^($c;dJA<*qZ{$nO#Lqbmhw zs8xHpt2CVS{kidZWcZnz_+8u4PFjD=zY=K5!D*nhN-PArk;i&VM zyBOnaDyls`G*HXxj?9}9W9Hq`!Knn>pO%k&P;SpvO;`2-Qr_+Gi>IVeX-n;|f-)ci zHhm~H3jCkLJvX0c^lHl@&hXmqYuo+BcOAjr&4W;Z``cl=8__`bvZQn5J!ioD$H&#+ zY$yD|la(-f>o#Z&IbgXESV8KgnUV!RO@jTIjW_vtZy`y;{fo6*nn>?WEUiF!7%=wb zyYlsXEB472ak5-2gi!&;SEsN%@b;AtIVvB4I;*B74utrU@=rWR;_w5k(Dzk%tEdQu z?g@(&`xHdtw6lI;df<%~k0%~IBmt?KJKKFWhDTYh+Re-bt~Uh*%z=Yx&??Nmn1 zNqgtBYx6Iuf)V+C#rLl|T!1O9PKZ%_405D7v=!P{1;-co*7C>;v7*N6?DI3>IOs+O z{W5e1c@|q5SF(~&ch_$hCdXiK*RPE>_E7**6{PiO;Y!1MS)??03lbp5dc_0I`0KFx z&RMS&ft$EDxq9*c@pK;kTzBsumyl3NgJ@AHl9kNELG}*Wdy~EQ-g~d?k&LXeayYV* zB$XslMhT%(p;Y(pbKj5e`}aqDuFrX2*X#9s{&PD5$!Ok*=n0Pm7AgL>LdV>IFWX4) zu2FiBaq+qoy>d9ZDCe*BO+X)U^mjD&McRN8Uh#XbXMKU(&u-haHZyehP~gzrcv6pu zEy3~My@C7|Z}WjYwn)XrBV9}@0q9w>9TtD5i1G-`=3&*my|y>OIK++4s24Xfe*_?mzE-*$<`OcCV2e$N>@; zQtdnPtkASj?eCaJy^uTgWC%{G#kny;1(B~Jz^ywTbAduPfxeidYGqUkGT_rbRQ&2L zEQ*Q@He+CzF zYILCM{1o3wYaMu?wCj6+j6S~S6;)v=5rBt(e9o6hCwU~4c#T7?EAa7J*#`$n`38xW zZh4I*Kd2G2Ykl@gI(q(NDNtLr4EstqAG{Qi3cFh%-$k!H^whKYv_DG|uGqh?X||#a z_G-&2xGUwM3k#I7QOsRfW}W*ty?7PemVITjb0!%z+K3+Pc-)B}?K@Z=UE&S?zK;xk z^x7PiIZsl){ozOIEq{9RywnC-#w?y_e0CNyA99(vb;JoU`n^tPB<)fAf7;7G;>*QF zM>eDOB*nrn6_v^FpDJV8l0DoNXFY+o*RAsJGV{qMsz1ARF1I5*ys{Ila!DOK7W+BT z`SU{2<6V`*Qda2i)fcCNNL1a~0p9o>-1N$Lw3dGji~ErL1;->pb_Gc3as z@#?D0b+{S*QAgoQ6qsnctSadU(7IWb)v1HRVBAS(p!kU=u<{H9aVuVUE~(19EXEIp z^KfjqztG2OWP>pp=g+~nL#25MwMZ z_Ec3ogwL-oE_{fu25HtIo+6LC;QE_!8I4nY(D#6^nT2Qx80RXSUvNQi=1yJNbF&tx 
zA{pkj92)~R?;dP?Uq#}XGfkxoOh+Q}{Fjnb?J*$fS5QM_Uu}VY>;mwTJp`I9I}NCo;_M!gO3LPiW}6b!OhbrI)RMQ47N^ih$c` zNcyFOcf~n#Od;HTv@rN47G-Z@erpm2x7%i2J$*@h1SUpSwMk2`tNjT|7xqIzd%BI+ ztizCsD4c98Re@hlnazfmSK+q1V=H^vGa=p4e?ENey$OHM-~IlS(+)hn#Zk3hVue(s zI?dikbAtOCm(~iAD-fty9XL0Vj*aw`CiV1T-r&8b(Zf{rti=~K`J!;!$2c-l-T+6ULB|HQx{n>H#VNQcDS+3_XTu(sYC(|o?mjZn9FvTn+ z<@&i!%0~o;Yom|Flhhw&tuWygCw#Yrhmr7u5#AcYr$BJN3H~g>MI?A#1V@J88xXx) zq7O#&6o~vRk?SJz2E-kV*v}Gs1L8eP%-KODI;rBULD*ZL!dyJwglYD_owzxb4eVl@ zO5dLBht)D$K_(C4z!~#z{Uxg!$U94uuCp`)Q#oeSkHwO{*F7FM#k1vLdWy>Q>+cJ} z$7=iNi-)fw6$4n{YFLWJrn8pb7x%#W4N)P>)>1rS>$Ndik_1dJZMnUIKN=79{7uef zhX(yLMUPRFcr1g{^xSOaxNcl$N>=v{?0BEAByl(yPnC!0aK|S=HCx^U(+3G~)NXz1 ztV$LX4H33gy-)%(%;}cR66zpF#}^s{+jhvuB7`ql4??k-hdAw|54PyZ^xtFAfR?;x z&K@S^`U`xnWCq^y!ZgtWC*+4~&;~oT@}6^TFyK40=9LLu-1Lm^y!-oHG+|?!tbBU} zzW1GFNEoz-ZC_^0L@4c%YQ?o)@3wL1&3Z1CttdVDbMBsxH;;fulY^DI z^b*)adgiNXj}AV%+4izsClH-o=BbZ*?TBV2{c1FKDq%NmWaDU@AS9<&dAyg|2K!lB zc(77egs;WUPRD;VWlT zZ(Bf{5U!}6qXU{($6Zg(--BVca!yX+Do~gwAiJXfd1fXN2tHNHY48pNWvrIofnVcc?Y14N*5|-OxkFuFNPct6_p&IbKmZ;W zp7<8zGS{85L#}jZW&%EzAJf0PGXbgi_1Mn8nTEpV$M}_g%)#$bzvi1cKSDG$u4kzF z5wc|1T@MhRhwD-98J}8j;F5zBy@KUe(06xAL7SltD0752P-j;hj()t*UdCt);=-W6WoYZ-q9C>GmMs09N<8hl|Z#0l=ytd+DNZKQvbP~R4(2FgW0foTn zUaV1grpR?|48D;de?WPK^qsvd8*{{wMU! 
z|Ej(bA7DKG+b+8ePD)4=OJC*yzl-EYEf5!2(J)@T*DjChSOTLuw>44C!JMVoD0Ni$ zW!XkX&Jl1$7pUvBL;&w{zYl3m8Km#iBl;{S)8OWpeg%ux93(9jQzNFHkD^yDIaQ36 zKn6dh)2eIkNRF%jT;0tu6qx*p&*W_b6d7sAa7Sstv5W1W8G4d$Qf_VdC26nCWxw`_ zF{}#on6BKcerbjo_8k`$IqeI5=8JmIIoP7>ynh;_?>Zo}?;py)>|Ycsn2}QvI_QDf z4>Lz4k7UEZJY%Lz<4Ba4{3`xvuNEq9kFk>c<%qugCITA;z+qYoA3 zGSOI8M$&tY5cI3rLhI{;-5~XuH#>X5708^PBFEaVi!)6hUiA@D_6S8k|D+wyC*=-e1uSbfm|jXSMZL_fPZNfMpTAoV9Wo}}-zn3E zg~Fl9)Q9khqtQs7;_eTMB}0_FaIZF--xA$_@$KZ|Rs_&!v0Ho`WQ*u)Oq;Yw{N;ka zJ6aCOt{``x!>foDTaZV4gj4lYBpU0Aw|v7;fc_eVa{ZQ$MB88LMO8$~K$y_!1-sHH zRKH-?oM7}E&daAp@9e8Y-(LtcDQtNljiSCIV|1^ew2H%5O+I&IH6CTi^4=Hq!qg?v zvKNs436t41rD!nVAiH@;H5JI%Tb*KPcnFJMSD%o~Nd=s?yNlBYO0j$2pB=Z2c(^UN zuilC)ip1Za7`tBNfpUh8xu`f3!KP!Vh0)I@tQlr`Z0ANUe4@lNnWE>1c4@I(x$-;} zEqhA34?GA3jLTcMDTD6fg5yUyEx%+zJ@b|B{Riz((b7v*uPl;3HSEvN^*za`_vS>? z%yJYAde-#WG9wV{Nv7*s0XLk}>SQ^*KNUpCvO*J`STuZNnkk7t42&(mzWR+Z6B#ia z6z;uq6W%zaD(;I?uu3DD+Dl$btn0C8p8g~qOdQ>16x5r8{+u>tNwW(Fe|H=BKeUfQ z#+U3rJR|vhgLlb`eQ5QCa^lW^p0SxgwX>|c%<|s&m3X{%FNvoV%${_I`g967Iv&FC zEyf=NpL}H==%ojWcRR*8Q^X=R)}0GQo3<$RW_Z(whweaRlTPtjKpdbrMtLi^-x$=~ zOJ@8cnF#pB+&KTu8^oSqTQ5@fL7AEBJ&guAz|Y3%zr8`Eg-cnulo(ogIz3PQC>xl0 zEYsepPKUZ~i*w{T$-t}ji&4%)9bBF7ykc*x1CRdrV4{%hgE;RsHaiH20=@9NeuBzw zi13ONzFWe>Nch1BZw=v7AUNLyf0p1P61*;gBSY{Fh~6#H2P1k4M1GdYbrE?3;toda zXNkQ5@t!5-?4R#K@AH2>f^yD#$El~SfcPaj)v14);k_kpliJe-xR}k@Mz7cny<<{`M;PY=0xR{Yi6VJUAA-emU~e)T97t%*QcT8 z9!fgn_EuQ1_|+!lR4qPf7muE z610B(jIR)>CpNoub(1hSaynTuP z)UmUVaJF=4=A3Xf$=kGf_T5V>Ahl@<-we0m>|}d1W}=fxliH8{ zul1()vEGArCsqIWx22(tg^`E<6h6jTjn2n*0vcg~SN~ubdj_hj3obtNYyeA|RRo?p z;0+SP>-Xpu_@E)jsh<3~SZo-;p78Wz5NztwJ$+mHF#dhX&p6?-H?Yy1OgMKg7fn8VqkQ~o>&r>~ zyW<5l>`In6y4G6YG3zz(J}s-;w$ceW+uCfBPRqc3&?L0SF&r1w<$nAm=nlD5m}oPD zw6X3X*7(Y#bMPT!Y!CaYBT8f)+%ozpgvz#M1+Q^hfxrvfvhA7;SUbemFIb}h+FYok zeZS(3Yfh_3B>C&Z{{06$qBl~3G=*cO;&;*>Iq!<;XzyP!tvZ&ejstFh|MkPfL%ZVf zd-pF4e`WkhzNN!VnAn1bx;m;Ts+#+>#?bQn4*L@l>(BN>pL{5IbA(-&%Qh&?jB zSsuH_D=75Xi@E;D9B_wM+=PHRS{v?Fr 
zt>5R)(DIKOz9EZZ@0rRqMjOgt+3=5$GX@d(@cG+rOy?rt`1PU#E*vqC>AAbhN|!uP zNyV&f+%d@Qt7*(FX=ikePsyz>*%_TcTEi|AX*g}>mxtITC%j)RG2g)|0803o24_^M zV;yaN1?nSi*le0YspU zc2sRqSD4y=c>|Wp2W|h}!93;UJXv2_Bb^zq$$$3-tdpnzd(T$>lscT~6OID^SdEMB zZ^gf@45al_4N!YI$8q6fil|3NZV9=N_GPs5Us^yoEE@l2H>j?FEw*00Zj8GC2(LKd zyCpn~gddFX)(}1gg7Z!AX9+GM!RsP8G6dg%=-m>1FrueGX+_mmmV)1oBLmWtGoz9vyH^hxCAW2bb?F8D)L!@3dPue$t>Eh`z{ zrb|u>JD&hv*2!9O8an|&K&{sle;a@71xbnR{gCXlbx-fwEj%I-$HCAO0ECTOW0Jxg z&;)C*hx&>=a#}O4`r=RGV|j=HlkG}8w8F)#;!qD&y?34%RK{V+Q@oK3t=dR%uGMJ& zBW=WW;Ta`m3lDJjIrJ*wv@IC_8Fk_xjzBPEQX0y*9ty^0xas^td;zm|%pU5GUg&~S zf?m|3C04#R&{lCR-dJXtQu4=)3#KOBgJ ztsWCP_tYGLv9E^T%()wY#_||#B}*n$ND8OoO%DNQ#eUsjI`pBNPSa>DEngfO{4hDM}KqK)FOi$j6;DvIDxBCUmp=*;f+gk-Y#K+``4IEA())UIMUX|*o zta_hg%W-Q^sU|EI_eK@HzkKgT4~rd`x^4V-%!`zx+c?3poURPyCEEK$EW=RG499uk z8ILw^N={v3_C@iXFP`1_Z3FH-DX+^ZH$$9vyWX9m?ti)(&Lk~<+@J z;V!oFO5dSiEQPWb>uV<;nS(?7972vox}vX{QsiYC#$f*%ySH3N4c_<_qtf&{A96i2 zJy&8J3anAoE4o!ZBpTQ7c}B<+rDonR3~MIw=~XV=?##=>@7ME-X?Hnea_8m8klF^& z>dwc}Kk!AFYaI;&GN!;KwQ2B0r5~D$S1B3`E{A(d+v)by7h}BezPxUc#A9XYf0G@n z0}_HXj)@$NLi%!abXT+N0AuxWXO(ON^jLn;Y4495%qdoCYO0b8t1eg$r0Z&6tEYcX z7%GI|I*|ph1G7HJCb`x(F2EB|85el7KCuOlS^G_rA724ahgv_$9dJhZe*^zgy<`X5 zAs;_K$|iYOURG+{tM&rHYReOAx~hOmQ6Mh2*$UXEeW0~FC4ukzOMbFQxrVqu7ww}D zx&asteEatfhULXm>IzpiKx503^$2JVehI$)_dN@TU($2$zCiM)NG=UsIxY#He!j!! zr%Q%=a=|~{rwov5z($$?nG)jd9z6J@eIN`e`JdA`N zjPTYFJ_UmFP4H(4E+WC}A~-Sx-+<`d5`8eDr$FRqiChhat2S1B9044a=Q|C&ft~gxaceNnq zV;E9w$h|mGV~7}xe1*@(yCYQ48`-B9hjwp%?Ws4B1RJN0dcWK+2~6@vB4pkkK?^6J zb`7{_B8DFMe>@ay5zWIe&ZaezH_LmBhpEgGHQh=OfIim1W+-fNY>E%l|1vNoi?s$? 
z+2{6qvAKZyBLmyH%x2ie@(D$hyee?$$T)S|#R{;G9SPaf6o@a8?)6f8*uZ9k?@W-d z0V<7klJw7t#O!T{g^sQXBA$DO0S_5|NW3p>v&^8D#A8_O!?oiO^vG|>x~)Mlf57$0 zv8%s2gEgKrC@c%W_wKHHyfzN^Gry3!y6pz?QT9fKj~-Z3yk(O?D}zo}D6li!_}0z7`TOVHsTpd|ljP?HNE7@1jl;V7w%T+|oa zsF;O;d%@&mejtv9f9`oE(m~>PcE30?F!Dl7OyQQ|$x3S={^7G_Qyxawsa&fR~JY56{E9RxRRgz_s6eRpmn*o zUOpoemg{s%9f)^?j&}wf9kbL>Y`8sNew+jvu;(vtcJxGSEXU7p>??!X)+PRTPKLwQ zkZ-1;i8qkhu;>i`iUeXh5Vv$UGXR|}eY`x9d=tvt6dmqmj3$*`4vw}vI3l~v6C;cl zOpvs=;AoD#D_U6Wd~hx{0}t&p-^bzTgar*B>zr{5hekTDS(}$k&=bK!i^}CBKcKeA z!Qw4lw4|K;>ldpxNX|CT?M%$avR3Ab$8Do=ni0PJvmps;Qs_Dy&bonuKeS#9Cwb#D z-=}Kt)wqsYV%I5OwYvi5qwDQvBrXEw{aXpYPR2~?Mt01H6kzK^DGH$y z3y^LWZFG~y1QpFc?)v(}5D9qmd=I790Yf&TjoelWsLac;qxp^?%6uu_q!?s?E_BCf zYKvN-)4Tb__Z!T18Xh<;vQVjkDz{b==*Y<6n9X-L(q{*dx!WBe^IQx0ex*4?${XN~ z2cJ*-01o8zY?l}HMQNl>k@WB000XY6Tg=%yNUfBd_un}?I-$@K5PKLG?p0*UL`%+&>Q*r{CKvw_Ji1*J$r0WGmwEqrr10Vl@Hb6(@YRgoly*e||8+ zTSNF12+lXbpC!161h0$W$Pj!3qIXO5!HAv$k)I`UT}0l1xPuYQMyl06yt8B8j zSd5gnwU0mhExBF>Z7tfiPExrdOEz(vQ;c4SqIXIE8dDzTlvVo5c;GfnmA|)3CYqGr zp(#tZv%hqGHUXaypR3@JE{4rD@6Z1bi@>?z{OP(Zn!tK@^8p(t zNf2M0zihQ)4ko%MPEFZeK+IE|ZPHB#0C-sGR|jZ(`ZCBe6;!;E^6X&`dQKR^<1t+dWw!<4vu+1Q8g ziX`|FKQMm&#dTD5#8Vt~xxz8AN@K+^Q=GBq1M`C<7C1bUd8h0oMC>Xx=0?-jsD$RP z5rd%tj$zp^LI1@E+xN8ayrwaQB_-3&-v=ZSQ%#e$NTCc$^(QO6=6@Jv9y_Y-yd($f zPGxtK7hc6P+FOTOrlc4PVGpQh8Bn2`K^}T**b!`1Ji;Q(hJ+wSqVlWG*@7jw=FAtJ6~Ehf(Ldhb;$9P;J$EtNA`Tp-OM^wy zMJK2`gVAX80w-hB9o)kkZTMCt0Qw$k84@_?kCtk8mXiMzVO==~i}R%(z;7s;`OaQx zWOmZ=!{?`#uqz_r5a;0%Xfxw5dWqi{OWLO?IxLxjiI8a(iQRIDT5GmTi|Pg}O49#W zqFDo(pMT`433Y>qb#_Huxusg zJo&osq73}i^)yfpsw2*3Fqr8g2feg*SVJp>;I8Yb>_;rDP`H0T3n^~^i~2jmNp~&xU6{EukWiOc5gkUSU3T& z(VJv@H(_fKl(fJ)$ES?~Ph~ppq?pwfzDMgNF4(9oPMiOJDfSQ4 z4{55?L4UiqVdPDz^i2? 
z`I!A{&xx{GMQk&__OoD741{TlN|4h>llnb^eJg`z@bZCnC%}; zZOWZksQw&NmIfMw#)Fg3!n>+)k4c36ENup~pFW`(eAOF_4JllDP0I67mmO=}8(<04 z8~gDq^4%m)V9y;3Q8S#UFfXT(dlimT7}Xy4-st`!db4k^l$4*^>)iAt#0VG+-**iU z3B|tm_nt^R;|izl&NA(1;lqf}7p);B24H zZOI$vc>nW|4y2%gPPhIJWlod@Q_fmRVp=NMi5jw-|{KVK7?G|e^5#Gb?5 z4f2APY3mZaIlt|n$%!H3>(0Sa%)E&A=;gM*Y$Q%w?72_gf+^T4Xy5J6Yl%?PwDQ8c z5;I)29=-2w>p3`?c!KHdZa451>Xi?LO9O}Vds9Vb1Q6Mq9@&L6R^T%I-#dD|<|JMp zFJzRqhMMD%TUoyW97wZ~4 z-AJjXZPAQfKb7UY?C-z=N>+$Wz5{y?7~~mdH{t`>82S)xTR{!?H z^mYmUf6UA9UmLmAJehd(E7M{9<}-ir`dVmS`$9D|&Xg056CuqfPL?`W&l+sSo8mRU z7>%4#5?Viy?q`-#nGe$qi=lg{((FGMl5k(f?ri3oB3xCx@@~V)3~>x4zHV4{Mq~|< zP6xFj&~nm=k(6pD=yuXc^omv~_FN|wx;rf)k5IO__|Ysl%sJ2XM&Avn9B8Ze|KNfw zxIHPENjX1h@+V(9wmcvg83zcRDZ@EjvYJYFec>Db=*WrwH0XG$W}K1C0=(Dq1##56 z;NFyP;{=aBn12yaq~c=(-2WVL3M@{;J;%EE?z{@ea`!ktb&6&~|A=dW2{p?2*yI-4 zW{tw3GlmEEmB_%x-pBe1%2KfPRlZtYgEJ0%B>Lv!Dv4LGbx%_Kp$y*f+3lD*e+|(@ zG&wy@Rlu*CJAIT$xyM1_Xa=7VO=K@+AgDrSh1YVH|IHh~SS6X`WP)ml=ElOm`&n$^ zwA$kDAPFYkZnY^=oH$K`XX&}5YPC?%^T%qP3UVRBL)C^sDtisj}5 zB?FRQZk)EpR0@t8Kg7LA_XLh7YdM!8;T0!*w}gk0@PiTF8p5YQaJ~utEWt%2cwGcX zhTt0zy<4IWM)VYj{49~{BJu{r9gNt|5_<#UJxk14MXo(R_Ib3!)Au`%b%x|)i_Zsk z$wn3c@-LSTc!wu$z`WN&%sY2DAsO1JGu} z--17!+K7p2m9_qP8RpM?@~Qq*4vu7&5#5=qfo5++$11i+-jsfe+1eTb_~z~rCmkOF zc*scegc#oi+@BH4@`IE+de)HoAiCZZQ~xx6x5cRrC)xz5{|2VQ!OEc=O+7BCa8YoN z)P-0etkHoEB|75^pH_D{wy9u+FZ72~~2Y1oFjl;Z>?!0F!bzr@&q4!Xp0fYy;e|3iZIj%32uAtGJoa+zps?&PFl!91^tsR+l{W_9GlQ?S_BY)(nh+4DX@yTN6fx%SZ{PncA3^lJW&v z)bqzhLh|A6Ps=6BdmNBJ%N1+SuZrNF9AjiQXE=5T<&Nguq&|VVB$Z)dDYWA!oB7Cn z3R#`;xh8Yf3Ne$v5A=@q#PLI>G=&RQ*hWo8^{Gk~ly42_iT-K;_><0v@8yn0$EnX{ z-@UB^M{wdu^OroVHTrYSCNmHAa9#^v&-MXQGr#iEsEiTko~af=?L=_wjBqC!h{Yzx zi{k^OuB807j*?hbHeA-v)@gl~g>RQ;(K7UEgZR<6?^2aaL3GK4!RG`oaL2#@%mlwX z{u@<#;DcZc&UW{p@LWuXYc~_f<(jkbz3xG3AAT=1)hvA_>ANQ)V|3r8%O8T~m#RzK zo-2Y``5L*48bMgeDE8pJI9C|ET6n-520{t?D+BenlJK_1^gh*v6u2N1udP%o5BJeB z%VgR}!1%FqFJA{+!69dH*AFleuEvlZU=+N7*X=mZbOhUi`?*&>u5n3W!}BSOCE`@j zh)bxjB#!idWxy4pEoX=`O+Jwi+vwvWR`q0YvS_gC$?@+U44RD;?R)EPk6ai#@147> 
z0n|9RRR4X?idUXs`ghK{I#br_u%Ch3M%R+L6++S9484(j2L}*Tbf8eRMhmyTpuK&* z!vxZ8>#zIm+l@DmRBP?@S^~l=PWWyK598_o`N0Tp4dGKDINt<+mf#{1ye@(xL+}lV z-YwAwBYFx%ewN5}5qSgR4o2)}iM_%9zi0n9XFGGB7~*RkkSR)bxBXssw0;kr>OSd( zZWk8avJ@ueZv)datza=`)@=zGztsUl1f(gR7}|kb*3Ta$>_0}za~kbou+jyQr&Yq= zr>DUQCg?x7lZQ>FLeeL9m*7N-1w;RtJMg;oDi9u2M3Z|;CJU<-QT+@#k57^)O6h-6 z^j=d5?5Y#Fp|#5ja7E_8zruDvhDjxuJVppOP~|MTUXDd``CI!M-L3F}K;f_Z0#Z@2 zfx5r6dm7pad}z`|9Snu7e$ZM*7@~p}xJSCb|9PydmW>`7I3!tJ_d!#y58UHlCf(a3{x*l{Z*{Ak*jb7@D2Xc>S}S&p z+@QU>g83gRKQMJA>CU@3eKZwlX>A?mhzw}wsxRueB8G}7t55!ax-Cv|4(z+*iWPn_ zOk9z<1#@i!${74(&~xsW-N$NfAR3nR8>0(BXw7&cN%*8G3N5IMc`QEP4Hj)S-M*cG z!phl8UhdjBn)dKwtQDTv>PuavHR@2pa1k z=e|opdS?QG)lG)biVNe=Z2za7n{DYJc0%|7gG&nVSRFXBxjg_Kq{JT0sgwAlk7sX$ z7u4aiH;zB}=Rzs$-R{=+xD<-?be;{f>bs+p=TZ|#cVa;l!=qWHpRL$NY>WJDZ!PS1 zvmMYmYKVRhOp%Ytq@l?6Hc6`XSkUU-Eu&P^h|^OdDEQyrf_04b;@f=@pvQ)Kr?-vn> zzU=3X7k?AM1M=o)S)^W@!s2*<`#l-(gXPMYR&*3({TQIj8Tv3iYIv{&D-A|nTX*m_#PV3mPNK92fJ@?Jwj zD=HIZLouk1cX#q7`kUy*h4>v;MoDDyRn`8$D@o)Xs8`5&0Rrm%CDWD%^#Hx7Q1k9+ zGq5n0MZYW}f#$b=bR0ZyiyoX|N#8aL0S-rM15_lY$cJm2Gx@Gc+%{K#}w zOqa~UrxMt*1X*1HdCbf3`f7VHAA3(tdAbw_N0s#rEPJ96JG1kr_XHs}DrHB3^W`}C z>Ub&p=QyNx>LQ1khc>*K^m)jV+6A*V)~iiH13>PgCI5nSU)b)}O#anC2F_8-I_fc! 
z{9avWcwbc{;KdJgu{TNFCJougs(!Q*pjN2(3GWMCaJAOR((_U{Je(|fZA_{PyGnA$ z{YZ}n-TJNHUfg#E+Kg$1bPuHgpGRLuqO2HRyq?K?aY7GT(EV1U9CpOcWz3cIg_h{n z-NYm8375g%OqP3Mc4lDjWAQ{9Lt_v>v+Pw_W`JM2h0`Z_hr$mAZhN=p^6|MRPbh=b ze1PBY#4_eGU6AcZ@oeuq7jW<}g}}!@ilA*@%1L<#R(SlZT19!GB0e5kcZf~M3NCO$ zx0jqgSo$%Ive|PFa8hSu$GKh&#P7iK=ZDq6(jzm&q_b{FKFEILkq+q|slJzMX4wK6 zuyJvor!z&~r^&C{JTXP#l<$I{+WUabwmo^tH7Q^uMNb_nr-PnFs}F*OvWR-Tyso$- z0zAn$Z(d8PMZ4AbL4LY)aKD)4!H;$)5Wn*Hag9nZ9%~shkcaJ4MVq<)(%kKu7#0}lP(Y0D&UY6{a=pr4y2sSDf<*Y z50LzW{i&aC3<@Xjw~NcV1wH-gIzmM&p=eQ;fu$+Q_vYd+uJJ|(`AVC~lT7xgG3uu7 zS4~Q{>T~x#`vbM@%#!z!fqV$iHCx~ z;~Vq8stZ#Xk1TSM?%tYlH3HH3 z4FAp9?YlI?MK3)u)spJz4I3q3!*#JRnaUBR7r8!XzGVgYq6C>j#Z2)Dg^b-?p=MCP zh>ELtLk1IGal&^?co+#k7~!oUdT<+E06 zG4`cxs2T~ZhIEb%TkC2u*xNo*pP5w`=mfvk(W0{eQ-%3S2WCA1`x+2S4)h1Sg?H49 z`7Q8?o&Y7M0dpLm)Gb=c?+G`r&Ic_%Nrg3AXXmnm%Ax3M>vd*r4m6-G zHeq4N#DhnCr?LesQL4)Ut|-3>I2XwAU?`{r>s=kaQW6`6PIsT(&w=Zq=$+L`Nl<|m z~jOVlFa7FRbsKJQs>${Y5)CG?5gXgia*Sb z6K{ImrHx{1edu#CU671gw9rRieH71qBHQn`4Yaw7zAxpvVPpaJL-ugKy`g zamkC_5BI1mgPfF4$(etyfS2@}(K-&cfQBN(;M64>7~odcyfL1QL(h3HyOoB)qw_jt z6dPukLZ7qGugnhF92ot+tIZs+iyurlcHR^1kG6zobbV0PfkpR#t5(oq^I%Pmx)1Ia z)^=;ovw}%I9IxM`0W6=^D0tA386P#AZCmquh*hrZlC`Ej#P7bmEey#R#zPJV_XW^A z!N2oPUsZWCiVMQ4T={v1@n?;WzkD(eaQj@Uh*ok5ZV2+sPCi`*R;oF+9{Tj*b0tDU z1wps*yLUV<@5q*bMY{HS7&3&TC-(1Kuf2^USB|tRr=)?)20t0)SNbvci62>7UrTV< z*Lgb2iyo}tYCHB&Op^Sg837TnD!>oTJuE}q!TWzKX2=U(kaSl zM5RQ4dFF%0-#S&0!nmQ3ez_W+HObE0j;etEqp#1hq(mWV!^}jgiD+Q4xN2-C<4wBL zrniYzCc{i}W`}RqwwS$qt%|K85YH#?dhRHch{txV)a|zvL*HdDn%Nk70fh@cKPypq zAtQ-du38dD)adE^Va^xv*x~!9ZXUB3tU-Io>=|Va-l3=cn64fOZZBFKeL?CeJkLiKejW0L>4t^Q{KAp=Y>XdQZkQkHdEa&%-!%tYpgFHA3qxfBUvw4 zaO*n)qsz%pkC$FUk;hy9yMx8fh5omn^=8Q|_;+ux@?1LT*AWlEZni#a<$4{F*YUjR zEi{E=AJX@JqRGJf8EF!bL;%cP>u2ba(n8^(OFdwK7F-p)N-j73tMjn@v;6lC7MSpg z6TVx*!$|nS2yYGHQy@6s1b>#`A`-kVf+Iul4T#}QF+ z0r8$C=IrD8n|kpz4`9Rj<)}`>a7=YVw9Uw`1EfaA zJq@_HEZ#y}BnGy`?F+~=C_(H;jI*=ayD(b?UJ-TAg=O?lt5#2xpx~}-yO!C9`0d5t zRl;v?!gP`J3e1;|BL1ElSc+Bm%0TJ?)y0 
z4Nx=p{-aYonrN`_Hmz8g8~A=zmFWdp0&a9}*Cw9`fUav2HOAM4G360Kv6);^Ebrqi zgE_h2mvb_w+`hlSleA1m5|rcEak;7PwcAUa&atoS-0lf1L$;UV{@DpEdM#ToFz^LF zZtV+;&W>QJp(f|m1HF*?jBr&sXA_v%sXsPI;wcDDF7YJ#^+C_WOX>ZiO(5WQqHJ2> zBiK#xC}B0X6V9$*JRPTb3$#m)3hRMENK^8|LteQKt`1+YP?k*tgCnmS!`a%QM1tPJ zt5OmtCx^oC`R^p4w$r55msSCtWmh>+Z5}oc_vk^CsiuR6zfy zU1`Md7F-${E~_l4#5)nFz`-sRbBZW7IbDoI-e@Ssdv_6#V-Z?*qzi+_P7V~0ZHw_@ zuKqxxXcS&BPfUP;p@>y*_TuM10bsB#cHrV>CL({#(xU9+4O1`7b?#nI$6xey7MqH7 z;PP~sf!Vq>l=Cj62-aADwKMHRU8>8*XNYz)eB`TNJB zISsoi>EB)`E`gf%i|CI?2HL%<@IZ*C0r$j7J`4Oo`p$ZJZPBRD3Vr{p{a@aonA7>c zJJ@0Gu&RIevo{Z56w}$X35S(v< zKTB{C30@b$ksE8rvXGwbxym$;I^2~e5a4X<0$C3Wz0F(5 zfa8blI5W2z;bw}s-NjdOcvps!4+~o+y60cZw*RjKG%@`e!h7aCzG89vQBO$<(#dFf zNxvr>wkA~8+=(`YZ!5NIXJ1C5TcZKK)lb^t!fXirdWSm{sAej*UkX9d-|~i+ditSw z$%3fo=?Ekdan^t*+6$b1&e*-yU<)U5-L;J_?t+{%VxkfG2XR`#b0;262jo(n_{s8* z9eQOwFxXI@0+S6~=5`Cq!^4%u$1YW$!DGSR3$Lz=fkrF6XG?++$lt4H!RiQtBWwzs zRtqFALMgu!-6u+D=SIbTUC;x6`p;+zDZ z`i!xI(=J$&v8?%R(R&@;hQ8lcXibaq zgzPvuw(LE#_ugbhWS5amsO-YmsLaDbDhi1bk)%nHM1*+me!u7Q{C)p@U-!AM>-~Pc z^u)-y8lDM$7eBL#;+2joQWIl4vDan^sYW?8=6LH7qG(&xJ{^zGz1 z>qr>_r=GA}wHUgM4o0lUwNy00^^1SUDw^V;*s@RSX@0X&KW#wQ z^o@v%${y&oflV=)Iwqck&7BXKZzqJo z$n)dRgYITw=OUqmv#OWSR@Bt@t69$I4%OdWo{BKHTa|Bep?v)#VPJ-K`8E&6D!W6i-#k+&#be074VFzu)pAOBY2*si+d<33wrXD z>rLp?!r|8GeT;#pVBuq~O*NOZpuh9Lbf%jmo~)CobzfA%raO^$9;O(g-n9$O9OkBw zv%GCz+UQlx&Y5*Gey=l1cluy{E}oS0yW-&AT_BD8;xwr2&d7o*tNTAKRf@x5@$TV& zd4pyN+qHjpu&SXW-4i5_jpggyJxxwdh%Bfq`QLl?R%S-azd6e$u+PtX)fmRH%CY&@ zT|`e@YlN%yeE~0j*veq3JOFpLiZeg!!=8OD6jNRrSe1QKTy=&K5?*n_cT0E}2|pO& zts#601m~OJ&k|fjg4acGWC*?i(Yq!3U_?)W$j=hFE+TJ0+`)+bEU`Br-m}D<<&7!- zEYLLw`)(|8h8~PTCPhQeiEpDo0gnx>ii9QLxZbwELaG#oW*x7VIZVn`W*djo=KqIR z8XnH|2q5?-OR^$LiVb}e5tQAVdj#c8`(3B3K82oUmybk!Jc@+cn)N8)aU_1z^r2U$ zHnJ_b!%p$h1vLgfZdzErggE^4%AGgd@nB{@pE~LP8V%o-a{V`P=qU5^1NEr@@&fh< zw+b!rH9FX?&DkFv6n)6xyDuHyiJP>|EDiyBCi5T1*F(T?zJT$=#3DHLrm9ptJQm#l zZZS}w%Ydcdq3&Z;8c{69%agXq>58&qLcJ}1XmLtO*S`e*(fo9@j5;3iMV39@ zj&ni-YuD*6xf+7vftIMH#Es7PYu7cp0~j%_92&&*G53p2+)4s%BQ- 
zGiYkw7CI-Di+Y^=x%IX^Nd1WYrGMlH;YeMtp8qaqApNbVLfqC9q|-@ou6XpqPlKI7 zI)|dbWz!e*REw8Ddcc#vw@2?o@AT$w>#GSM-lAG0{AL|~Ugi4gmv1RloU5)(335V$ zS#Oo7oQ)9g7X|~)fmCoQzBznWf7V=6V54K!* zEvso$h@F4CaE!Q+JUACm)J`A20#xruPSKLyC7*g0)p*&1K=(r8 zbWXZG!TUM~pKFx&_Kk~xb~?!^LV_{Ck#_Tkj7SP{5ZLPPYrF({;QQL+D@ll~TQD^+ zAp*<7x)Lmw1hcZoXTl>d<5Rs|@me>Xu!vd{)A80oG@om;-;OH+ut&nDpxY7XcI>$} zA*F}D{q)eTHuXmCKXnD8|B`$lGw0Z>*R;_=tqWfH!s?)yb=I+>Edb4EG3>5k3;{Lq zrp~`yR55>`E&ZPmd-POE!(r2kv`3#WJ?U;}gxSIGkB>hJff~iWxPGO0a7`rS_t<~V z&@8cI`CbL7&sSJdd|pot?s@gB@i^B`XX8MUhEc8+no-WFFyxa)s^W4_on~zin>&*N z=?(_<1!e#3XQ6jl=0U*#5>Ms5iJ(=C9$>94^y2!hffkpo-^E5?cx#3uL-r)JZo^)Y>vi2-4xZk4_zoriA34#Qs&;` z%;0<6;K-dTEI?$BfAqKoH(<Tv0^v&g$fI!q{E4*lkrRong94p3Qs z%d;JG2hPf0{0?5{!c!M*-ZYhlg33|W$5zIHAU@}z|IxR8n9HR$@GXg#^o;fJQ~79B z!1#mX+$H~X+#0DSaL5h0Am#uPMxBRMRpgOS z;Au(*&ecu|Go1aiM;vB4My4$GTjGw)H@45FEl|Of3TFwm0-(@#VW1K>0eR_<`7ACQ z0naqXl{y=eN8*oZ+2Wcv969~P@p486p52zIx?vFvddMB_J(M;Dw&T9MlhmX>X1Q^Z zM29?xP>AI}XY*68Sk-!XM4TV*TC<_lBKaSMJ~JI!B=^PxkHc-3xV?cYoy=UBkRX^z z5YsmKt_w6?eas`HbwP}0C%F!mYXN2LH6~_CXC%87^+#jf7Ij_W60I9uZr^gEw1?aA zAmLESWvAk7z{+`I&^R*>Fnu{wR_2@t?w$`O7htsm(Zyc^WG=o+V?!aqkrN} z0#ej^#2Wm8#M2h!Cs#@+fYw&eAMfG5FM{$;`l|<{*|1%? 
z*)f7P1PR5?KbJ4_0p2o97Yq!2kcjKZqvFzJ@U4#{KY274dv+zgVAP zzHBV}tN0;pm(5G8Tk%o*hwunCg_ zCiq-u!bzdsJb23A@u?4)86Z>6zUT2v0$6UEwfqEnVD~JO0x53*t#4HRyMy77L#d7L z)Qvz*o=b$ZlQv?D&Zmrhp$bHGvYGzPSABcL_5kl&x#Y zF+}1f3mwT~P`xVwI42!8qSsD>{VNLGvTsEG&nr&&ZV3+~;RhqUHH1%r;CvJOS%Qm5 z@VW?&48b=bdbdO$jOZy4`B@^@MdS^LI~cK_CH4lydzP59BfETA=$9H`{Iv$_7x(oetN0Bf-U1xgS|O8`=dF~HQyEJ)eI|T8e3=dR`kInnT&o{Vcwv}vA zM*o9lCn^@C*7a90MMM%FrONiwyk&q|unL>#V-YMjyO87-d>&fpM>Mm{38P5${KuS5 zykO%{2EA7hKX^6Mf5S~w8-LHc#a0_{3qLlr27M7yKw3H-Dc2d#f}pweg&k^HKx5sJ zUwg=fG>%{S-6$!4i6}v*-j$r#MUz!p8}fhtwy*G{8IU zj{dQ~Nx5^iAC$f95~$P$A(!X4+pWBcv*cwEtP})XO7uPzVJoELw4`476^j@=YbKcS8=#;G;(k&n3R*- zByVYe_K;$~+C>A$Y^?lHRJu3+BtoJSe7Y6@m@H&J6TE7LN@EUB(ue86kY=U)&golN zntHY}sKy-0yDO2O`DG5Qvz{ItJ>`M4jxfDgC^N=>`eW_p>G`veNpmItgL+rGY*Y6l|_JQ5UKYHJSW z&);kwf8dB#wd?=Rl6pl4rM=I6%a6k=I^F(OZCCME(VZC`V^^sCeMgQ@C%kI8uR=^);$_lLuZ^nttAtEJbh7Qm$S*2UYaa$roDY@hpCalj%) z-_G?$3{3E}e3tyG2daI#*PP>}P#F8A8zl$N!ag@;!HzC2U~Tgv);>xZxy~((=r8NR z4YQQVho%NxLFE1ibtd<6l|`mFQ&ST%5cb>)fK zUj-zcRKoc0JuAnYIsb3Yn%u8?u0lbJw~yUyd>`n6};%n`dkss8JabSFs zbDWtIz8<~!?b0Vrc>nq0N`_Sm+_|kQYWc?>I^W9J8L27;k_8zLI#V2>^O3lWPu!t! 
z?NL{#{Aei5@rCHP)sK5&K~etR%fTipiJ307AHwu<0QZd$U&b$9$u{p+_H z(@dZ|M-wlZT?QqZ$2T61+n|!+!=}Ox4nVnU^G30rGOTWSyA`ra3#L{x8;^b!z|v%1 z`|3!&M*1s$HQZ|6V3?Ltk3Gc$zM+#lPHuDo9+R*V6Kg$>S6-=ZMe-@2)nD)D_HD9) zvCss3EAkc`W?jiNIPXW|TOQkx7jwX`bFZFOw^9Tzd_4jtToll8TgvsA94*kfEPi6i zO&#uWZl7ftk%iLRy=lSh^0ZK))wzHQd<58edo++>yI>PfAT6*s za>I7oTpaCDIL9$qegOvTQ0g;p?uJGYYaLhANV!YbF#B%g~s?|Ev3!j=?lO(_P2wI1*r`gu^!gbFY%Ic18n95tp#H`W=MK}*t^C{IsYN`9v z5Z=d^TC;*W1a2bZ57v^xy=mBZ(c;Z2V*}1As+j%4eH*EY_1p07*(X0;)_G8Y4|@!w_{zeGjL5|s`6uOhyLr(l%lN;!SMsGbg_0J;Khi^A`4$Q z3Uy54@?E%%#dj#;YztlSyF;QG{6_*{D6;~CePAL8W;~+iCQ8cR*!|h`B5`h{KRnT- z_xHzVnD5DY?oq_`Zwq^T{4RmUxqgcv^=#C5;O^#pXAyR4veP)tcp1BU97&hn^v0eO zZBbfr?tpcds6g+y9a#21_B_MnRUs>4Uwv@6T{GHI~1#%94ScJ&&DUy>$Y1YLYE4M zIT^(e{QkbcXHM#93?g|L@{>sY`!&--t5kKUmE&6)8?OtmedLUpD)EF@+k?0GTI1ll zQi@qY>@T^fy>WFFtbt(v2L-*@1Vwly(}o^4$ic1{epFI+sQ~1DeBDKm` z2WzSo+?w0Y0OP#hPXD$Khhddsk7^qNQH0BdH^srWfNW-UH|fv_?`8_lDmyD;pR~(y z{wzoENB!MW&mGfIRMf`uy`3K&!NaQ!b(rZbU85)Z4)$NwBhU3DpBjD-1{NMsO z7Cxdl^5hP*o^JY9Y=uw$>eB*&E6Io1X9t2^7$S9#| zDIH+WB~JCkjt{V}6unJU41oKx26R83G{irJiq?vpnX&C|y%(=e(SdZaYv;a3m?H*i z`gW_qROoIQk~QEQjHTEA2Jj~wf)_OInaq4KM-wAQR(HEOp|T=Rmm$UsNOk6p_PlHs z-jX6$bmJ^Hy!7V%mTh z0oy!xD5+WZh;~UD3zds+OL2OlGQJ@iuF&J4N@ZJ8n9CM18(S@>a3_=cu8%V0NHJdQ zCu?OVA$=Y1IikFk(B+R-YQS@C-5Q*#Uf!hmC>n%jInh6Ea0lr}t6P|#M`CBj7X#Oo z??UY#&$7!N<-u0t^4T0#A22jkCbO~>1lYS6M}?WKaF_~vZ@`aw*wc}F(J!I|etzlV z);8w>u09huKh^0B$gF02jcTsK|HNwd@)?)Fkgwy)c@DXdHnrjb5qjx{ePTT#c29aYFYEXU5jL zG=RN!A@yAaRpeYZSOuw4;QGUa`J4+b(7*lBJ8S+pEcZ}!PD>*l3l-E;a8HoDJp5{l zRu_ChgpT%+9~WW(4UZXX6AQ`jG<#1eK0X$1%$#{Iek2V`j{NxafG+?K(+E93Gi3?` zFVU{fe~pIaRJ}qh$#z(8MD)Fq4iJx?{!jbS(fze+kTW!3D0Vxyw+ z)LE0EYT@DKS4k$|38O}6s^(cVte^LwLdqJd@-xTOOxt0(!vQg(kF;?TgJi9chy|d1 zmHXP?M;&e7Ir>9=Cm4=C$ebkQ4KRhi`oB9EOg}hRekwx|vFNY(2v8hAA$l}EBCE$y ze7%$7zd8H%QDV9gdmz3Kb%)p=y26H}iZr$)Js9&>I(*JB79S0oO;t53z{^kLDuc6& zar&E^wY?v6FyR#^e7A&$k??~N-kR_K>r)^&-vob_;35*dE`lRN@C}IGEzt)fdJ05- zmdJGxc?04OM(k&ay#eu_CFZQoQF$~g&cKV_T?cpnAj9|)}s 
zx)veFE}=G&&00K`D&H#w*Y*bhSN%UisMAy3)CeDM-tK+8+@tL8xu;WVPf{xhr&*S(pD}7k;?D0^iD=((l(r=lpB7*_f z_Xhe>>Lc?LRE1puI-ru(Y2^1;Ut~F0-oNOVj|~`^lQ=jNp|gJZ89@dU+-Txeyp|^l z`P}FZzx4D4ay!>uzTOo9v=rRTP40(4pG)s!`|C0Qd4vRUn)+f_;m5g|E>5s1|L|bn zlqAlvTk$v4JP19t73q2SzI9s1jo;9=7{rym{m&=-2eGAs!JwAF5FSc=J)uxEfVH+N zW`d~u@BvZou-X5*F&0seRfBsZ>Jb)E~v8M*2T$I<1wEN5%i~3%xeD zo&uKwj^(2)7O(a$-#SK3#6>^1ZXg<(WrgjNRd6e~s`yvtO~`s~)aIIF zF6!E8D$KRYg|^9WEZOy`(C^aBD`d>!pw9crW=PO&XvDjo&ma?zg`E4RANCdC4E3iT zPD6fx_hnBo?}i^Retk+;F8emD7kj2vH=T~R!_vz?i5B4qKJP{Kg-hUTxgklq6$4`Q zPfJ$DCBb+m=5}wLi=?}Lv+E#z8g?dUY2g(MM@xala-RFngXnjn%}2-L(D{;nS{A=Z z$Pnf(yV^tI`nm2}%p)w}&se&T#`y@4aJI4a0je?1T(@qX2? z))Yvdb!=C`uXsG2U7CbCY~h0UHy#e-B#71wR@G(vz-8qppWe6UGR^-8{41e{eJZ?d6ELG+kLq-`k$=jGnlywc?cVmDY? z{AA5g)gHE!x?Bdhrb*G-b=nAiV)Sp>j!=N^iquJ!6!GBqM6OBv`CzmZXLNC^on)4( zXU$A|6A8K5Y9vT`1Dv|aWw*`di{9=Mw|G+P1`4bcGXyD4fF=EQmw)eBx3O#gZ_WnG z=2Le4w#1rejGwjDDB+R#t7d`cOYqh$nYqEk>9&Qy%=srEXK4_1vO z(}|tObkx@@KVCP-mRj5SW_(7FslT=Fy-+%=7mxMPrr^h2W#wQXhSYC7`-H;jtRL?6 zzO|RPSP^G^qyFn-U5N4u_7&5}I$;L>)zH*_Yb+Wi9;z){gnpi3H_yGF4MlpUcLfYM z*_zN7&&i9)OLy_? 
zEg6*?XA434JB*;qswq}WItUG)%L%Z(sR>1EEyk-KspIJPB`psM`5;|m;Y`7fHE8~A z$25E*6>)Hz|L(bY6NZJ440u%fV)6w}>8BzlaF_BM_LCxN;P=l6Dld0k!1H1y^u-^2 z6kLC)F=zS|+^}vKURs1-9>=k(w4m=lv%AbtO%K)%&t1K7O&7|uPwuYgk;eip{M~ZNA}}W`Amz_AspoJncfa-J z4LObEOgGz`!;p-xg6e+lFdQwnqZ!wF0dI}fy*Ir(3>kjNG1I9Iz+!IE2p6|r*ohf> z(eoC_uh_5Nyx4$)JGEslXq6!)ok5;ke)k~CjjGV~s=$XO21AeTRH0r<#{bk58{wR< z{+F)sEPSo#WBbMCJLq$qJO^igB{Z5Z>3nzRGS*Z|XznYzjZQVK?e3V*A@L|atm=^| zBLC#q)TPRbBo4C3HQQZ@a4Vzkm=H$}?t8YsE+FLw%EZi+ zZ6v;J&0XH3Tl=ywpj~T9J#`zB{cffecZdV@l&|VXn_PkK=PSo5xiX;rvOUK|68|zI z!A`0{J(qMB?b*&6umBt0?Dvez{gB-VQ}OqeOMr&0rh|vm8{QYctawhFl;`HX#BC^( z4BvQNYOH3-!>K_kX<0vAk^5^Fk*z*=Fyi{?nRBNzDt;gG#3nQjYX7&qWpXPSTVE^w zu+SF|jnY_M&b8mbUSu-Q^TJt>Kvwu;xznV6#G}&;OuWHp?ZAvYTcsaxOrL|*=REN{ zZSGobhcIl{GR#AJ+aJ<14!d@hC1LTS1v=e{8L;^~`EHvM1!(cZzxemaIT%g7+GhX6 z3s&Z?DC=BFfrV5TQjV&g#@)fMBrRR-5xT2smS>`f={wS06K!~Lpz&3o`$FMJezSLr z{AHG8C@w4}z`QFClr@bUS*s65W~W5?IMOw6eAC7uGSa)cbTjtu&sl!> z`U;nXEDT1{oGZL}!#>~z*{pKLo)aK{(#`wd-oP89>;K-fYFXVw|K_ZL1ykCfvJfs_ z8IyAmy$#WDX1xzr1AN&@`TbLS0}QUa*c^Gi3X*|4#~xUf!LNo{N2^(r(EoYG|JQd* zco+#k7~!oUdPS1mQHBa-h-g4W<|(9-Oo=juO1WFNDJcyq zQIs@GlBrTk-?QZ2@80|U{oeQc|BvJQzu)oS);{W4>v^8_EbDQ0-Pd)V=XLF9V|~N9 zvlq{1VmvrFm{^&(C!dnPeoZ~9H_e#bMP8HRPE${qr?yd3+rrt;{1Ti_?WPbpllsHN6gYJP z^AwVqnSbw?f+h9u=J$hziF@)0Kea=I+NHB+x7UtsBIf>HTU~vp&iSu4+#F0(*Aii} z7t!0j+t+2sF6TY2jQ3E7{>#UF5jb^@Hk0ZcCe>doOiT`cyT)Jtb^qtf{i|y@{_Prn zzej(+%D=kCOzJ0N+_4!mW=!5Vj;Z^|!Ynf9_xnTta+w8F--K~;#`hst8*>YNW+t{j z;z;(?yZ-W~KfMr{dTwd^r=9=esp8Z=Z@YhS@K3umrk;nc|I^NY@s#|IM5ufJUp+`1 zb6WRboi%-I`VLIrf$2LieFy%h?f{AB|L*hu-Ep05u`eENVbh*m?5H=FEp)eU;2nqL zI;kR+?^PvU;lCZnsdN6ETbNDlW8^J=-fy|idNOxne6K(6*D|7y+G zvoJ6;WSm2t_`mxe3+pvTYs{(lP(M54J&as~@i}BU{a2qu9j5NmzsoPOc^Jo;n98*$ zH&cK8J-_(#vFYd2cVPMsOy7a&J1~6*{-^H%wT%DqP@!tnr!{N`1>8EdnZ7JhZz?R) z{_u9SRMvwy?QyWX`t_KlK)AM@_xId}>R08heZ1-z#Hk|KQZ?9`oP(y{BH2@0&e! 
z-rs-jzrX(b=eX_O=D|eW|G%F{zAsb$@B3Oi^|_(GzrQb`1+u4J=js0bx&8pd-=2`) zBV#w?_pK?G7@Zg)sm)Uuty(g{^N+(xcW$-NktgihE3IT7g^CSK<75gij9i*L7&v*R z6Z+`bm0|c##mODbS@!Nl@8L8~2r7PW|BjP8BWFH%(4YNdapJP8PNJ;%$uv&>3X{p& zoa|!66U%IO*5tAq|5%(9Jlb;m2f8_p69S58U8udN_iIAK1ui#r;eO9 zyQ<;y{@e4lAEvsMfAt=N|9{V84Ld)}OZanW|0uAw^5mJLtteHaWSh#ay|~HuS^lLN&K;03&LmoW7nOGL$Y+$w58MQ&VPfGeDBq>JMv!tW8+CP zui-{}zuxJ1@-J}0&>~4Z;hZ6Jg#XExe=JVkjMFXKv}aGplfU6)nzx_k?HPK@zvJzR zKV+J>|0_GR@mh^Y(v6%D=0t@Be*+dp?0{!@HP20mJ8t2eenqnYZ%A0nT@kpVDTIQcVw za^g7S@%|r+lMVM@`fSh{n#RfBa59~r{FRTL&QB)iO8kqt5mUgSHjF%WIzO4tPpJ7L z|1D0Y^ONcPB=a<< zZqx63M*VjF-{T>v;`g=wI-%=7<00_>I0pW!5BQ(MN5<6s`M(Pv|JA*kJ~n*^rtiT2 z-FM(|h)O|n-c6W$@m?WI>_ce0=}Q3;%Eao~>)TIUEyj_iB@?C%so3o5*jRt)emFeu z@-`LQeW3sRgQJHszr2I}M#$c1eb!;QPE zThGnaMAy&G%DXcLfYJ*OK@Klz%)DpyhchA;P&>0zaiA08+M$ny@ueI{J7!fwe~czl z^?z62E3S=#69!K`^iaWjw(PUnEWHTYtX+M2?e`V1PbJ4jI z;zeVFW?`o0I|q5PeNfJ9;rb`(JK*EsPLUfLN_bcKZNJ#NUg%QZ{WF5K`nc4=yW!qB zQ{;N&rnS_kogn+BeUf6d20r=W)40~BMVM`+-s4L;GhnvgLH$5eeNfHQ<}d5A8AVpH zdObT}jP0sUE%hpg_@$(T^5zN=xX+@?Ui_#bSZ;fsyYZ`1hSkpmjiduugvk zR9L1pG0e39Uf8jBUaRaZSTL&c)8zgrUF@@HTfl`ttmyjrYPacDZ0VM-yj*Gy?3XAD zR$09cu9V2&%k0tv{m*9T**;o^4J@~HSJ_&@_K0oQcE#Gj%{PUOitg~>1@1zPY6aqO z(27mVVMGe`^1AJ;I>r45j^ub}#i?!ShQ+3}+!`*G_{eMh`huUeSp21fK@a~qY(D2< zS73M&!UF+ z`n8H%VPrC3%gE)n(h7lAavI;5TK&KgBa;yMpb!*1$NgAAsW^NySDtmSRvV(4lS0?u zspE|;=LLk44G?$kk+J$J2Vge)+NX3ad2F%bq*Y7VR_Gs*7_O9Ph0FJd8gDS=LwN(z z@BLV;(D%MxRUS7hbo1VWF9W9x@alH0p3IN((8qX}jMnni@L{|nb3x@Q3~qI@F>rMC zzQEfYy-oPDyYLU?NKg1KZsCTbm3tw_2QMqf`Vgosu{9_sQwg_~qbHBYUBI|PP2ONm zIIN8bN}`FZz)33~lyllU0TF$!_=8tNpm&=Y%V3&5u9rTxqu``9640EfW9YF7TXUZs zVY=;%1pCDgj)#SU89t}B-iug^gEy@|etA>@w-3n(EqTum_b}ZxS!iYf_EkLJ>A|`O zrE%&ud0lqG!4B&T6y$C28Ua(K1b$V>Kb-e`c?!DSnA^^`nChBYMsSRmo zd5*|=?Swt4GhN>>Z2}UPq%^P1UWIRs_?Wdzy2A9!`YI>qZ-<4q+m?O#ybSM@w!F}s ztqBV^9Jo-oMHUp+3X8qIBJxK(`U6K7vb`E{ExZsr7TAz^18Sb%|C)0{6}&iWd5QC! 
zlX$H5%kgj5j{ylwIoS!J44gG8{5{t$4dHHw3~2hEN-7>Xibn?9cGS&J!zw$= zPhH-07;vl)4U!#?#;Ka@;h{wbkh*X_%n*$M{cqV{5et-l>)>8f*Q|8z%=ms83<`K`tqmUzqVEcd`}SK2lGZ(767kjJBk-{^q{tnsvh z4-P=~b?`{fnytV@HGRIio-uSua^}6Gwh~{KC{B!9XMi7w%=l?>PaD2h=o@9kqm}B7 zlQ>G;@XJ0*_8|WJ(LMkAhQruZzwx|z@DZ%NbpC~tFKuA1$;875&r@hXIwQ7HFan46 zN_21Kw}M@Z$7QqzkE3*R?i+@m4`C)5v72|duZNng`}FkdQjplMY@3+tzEJ&^%FIo` z0|*)Mse}*5p*?QZxytOe@O4i5TGb|X*yd$+VWInS-0Nq0tkFsdu@=Ut?jPF^wr*Ko zZ+vtoR9>WYStW1-Y`g9^D)otmHQ&k)E#B{m?kA2P6)y4w$;*`;3%(ba8;*kDP8Ukx^bT) zzwAXE7T^+?9Uhs0j&h%R{=qH@r2DUy@b^-|r495wEh?7yfZ~?D1Km1M?Yv}2Y5ZZ7 z-^cS^NYe=!-#pD0<0=PdvfjRSJ;E1{-0)>8>|YP7;}dfgqn*&&^!kCYJP)8PH0wxF z^K5imC@$pL6&sj1bO#ND=-?YNM!)pdj5wCJAE_M$B11d)`hey4FBeNS043>HsGvhkgw9Qt@T_6LucB@M*$=QZa2=}D9 zbk+lDQP;D^{%N?U%CkjAS`A;gr@4UVVl60U%e(GZ>^*Mlh%N=G*Zh~dtc2dq>|!%>EtG+)ZfByf-;!E?g~e;AwkUL%^x z9qReWU*(R_#;xz$df647(HW`zTqaiAf$>>~xR1KF=v7t7(#`!g_}cFCzf{E3aA0^A z*R%O5IK%yz>@C0tP4i?*?nR!2wjWHk@WmW}Twc-dEiUcAa<}RuzOLVg<@#3m<%^}F zOdVfQXLVEf%%X=+`_~@)HwSG@8NyR}r`vjJGA3|Z$rvtce zq=SUO=D@jUmGMR0v!TE4DdUi@-pcN`jBt2u$XJrOH;50&(Aq8-f#PPAdYn@=z_+#- zH5VUs#SZxrhItMfaq+09*x@4pocM|^^QUhEDn}jF$|tr1bLMTrjpwCsc1@c>b%r4B zyWKN(@1h+3&YZua>ZuZbam0ExBR@(q+OEP_*Ou`MW-;PX&7m9_29C;&ea9aPSz>A7 zlDEB&w?Pkp57qXREgD^u>GJUd$@5F~{feMV4gIU7+>Kf(4rloUp~!y4%+D5d6rH z^LFgnBY5naWb_Cd!~<{dyqQR}#05*3c^0gULQDJb8IKb-kmT7UKO*ZIiAMxST<@8b z<})`zaQU68@0~K>ZT_v=rzrt`wOA8=fu{j}yQ2@}OLIW}v)kvCTN0pjldnVIl?qsU z`f8tgVFl>szjtB8Bpu%48Fsv}C<|O-=DPN9egWdu7&%*6?|?0OQukieSc?|~H%lj+ z+6A)>E6}o4NdkJ!=QkYa_iMvZ7?dZ z-8bW*JJR-+&aY0l0FO5Ar@QJ~!0M_aU26Hluxlp$>NQ0xJf|a~Xic&+)IJq;p31YK z(~UG;DnEiu^BeT3bq#xWR9tmnc7xt(m%1q&;lhnAC4%oGae8XH(5CIqsMu-choFby z*fJ)4)qJNzIO*k?*ZMa5Q2*yUP1d&~FsGc@*2PYdnD*3L@=M=d)Oa_v;LDyv_-XYP z0l~$G(615B!TZ7apucRf(kBy1Ty65aW0#>7c6+9`-?#@rm4~#iS{wX8Qeqfa3hgMW zRdIU|uEd7DxuVXGo$$f@)pSwUaBb*uKF_85Q8;)RkXd;kBLda-?_C(4=!Tq9uJ*lt zB#sY&Et*nhmN1b`&th)VLYUF^SpTXtAATc9A1%FI4)fBtw|2x{hPyW2&F&pbf=1^n zcVDpz!4jX3{5aux1h}1k@@|JxAnYuu76T{{F5b4dcUx&9oO6F~X?cJXeElhS{+CC3 
zQ1Z%HbG(l=^0sWl%qi1V`@7PM^DM?m}(4M{SvM0BXE2FHttQ z5$Wgp#>`*P0-4&+FZpuD9)xjrzv?Q!j2;Df2ulXnLu>23OnL81klt~tro=N&P~7yZ zN9;aB*nfMBC2(5~+IPwAW?T1R(4D)eabM1Mu;+=1XSzT*{CXxt<&)@oSeu+Yn7*bA zn2FRG7Bv)rCKuOC*06FUXuI@g0IM?;tK2~|Ew;v-9&`3)=x&0eD<4Kb$aVn+!xA1$ zn-hW7hH8OEzGxuF$`Z`7F%|`RGM&^&%>~6Q6DQS^&ZCEoZ%+CqVDPX==fs1uc65Gi zz4MEd9FW0lHOs*v1&BnB-MM-z180QBur=F+gU*}M%3_08Ky5|a*QcRrxN-Awnn2oF z)a+aDUEx*>xV?^+C%EL|!0Yiz8G^OQFrKA4u~q?2EE{3-$g{`wNTg@5(iL%~%%ypz zhN7=OHqN-m83d5k*E*YL)-Z5}V$|8dAl#w6+^h3o3NV`iPY4@b0)EV@GvD+UqXT!U}QwbBY@7`fI)Bz)Kw#yN2dKIYB^I;uyO*LB2!T~KRxg>yKX5jD3`&ueIEplMIbo8(7i zT_f>`;7Dv?vha5AE6~2MPkH9P7-&}Bx_@a)GMJrGvN}V*8p`CvhzGUBLyjx;Gdxel zgA`lez9j<%Ff%h!w^^fHI}+uo~LsNlhzeJP7w;pnLAroshQPxR8 z``L``0FEixUNZ6oVfkfsGDD%zP}xU)Ns=#Q((gQ@mZytbO>+&-hiwH-rUxVT1|32t zN?uBU{une^BVKtZ%Ml*T^itH&QpRTr9~6JIwty)ooEsulC9u}ukEc}Kk43Bm@~L_a z2Y)S&r}AugN$99Nl^?-2{=)6lx`y{CvVLU5qwO|bPnVrJi3E-+4e6L&#+m-Sp`HuE za510AQ?~jzR2}t0py$9Pd`s-?7UQ z6}Qd>hmSvqeADO*@5dKA4hpV;jh9$#Jsyi=Eeo?XaOEzvt7^-H$--1L!#DVs^nze$ z-NW^H)>03c?Y*%^`;!oU6`Op+Vo3zByQ=zRq9PP6@l-u{(k=!>UN;=5yyl40jl?TQ zRJ7pSQ3p%5L&lg%aN^4BJFjTFmCw8~+Z>O#?mV9LGa~|fS)P}BueArWm!%(R4R(jA zs(oS-H*HZxJQz^D?Fo}Ax{lU5hGG!MX_TiEjM?H794xy0@u%m<9OoXf$K2nKsM1#M zL_sVT8E;;DVZtAgx}Vf*f8^Oy`4L&yNIW7q>Rvr!S`}Lh)BP9O*c@#{{Kw=fPMnN^ z{iR*u2?U8|JISi=AQ?=ITD+ zyOq`0)whQc)2{RcmW90V!RGa9+wFGY)W)k{dkWMchlAG{&O4z%VV-_&R7V)1wYctn zEVvsPCx6PIQT*NGuQymsjm@Btjprj3?)lJCqVIBz{~R27xxDJyh7$N);>looU=>_s zq-?OlJsz4$4WAN=*pG{iX+L}{ZBfwfEti%q_J_xw7`BUZ`9Xi#{Nddnqv0Lh)X&{+ zjXjXvjZj>!(E6X7bTHu|34cWDep0WIJe%Z4WL+cih~Oxp@^RgB zsb<)Ftz%~C<8*NJx~TY(&6iOxTT=hWkXmRRuH>~|>nvLRa^b*^%bVang}Fjw%KFe~ zS$EyW1zAY0pi^#PZ#anVJa#eor4x9aINWA2=Mc>M$Q&8_!4`5$1#qZ(mjEgMa2Xk= zGr-ZLp+Q5T6qTfn2i|wy3=7{~|0z{ui7y=G@e`GIgnBW}4eRw?fq8@Q$A>y`pi%tC zXNjdp!1JDOS9upjBcUQA>&B)WAXFkQ8roHemTqf%U)qa7e}B>S%=-_Jm8LYup~162 zU=`g)OSc3#Cfu7`*X(y1iU^#EL_c29ZbmPu1u71@B}a2pFzft7^FuF`A`P>pnW0U! 
zU?5|KUxQ{g{+8wt=hAx(rLQ_D@$IY({OEc`Z56v6?#x?oG+N0HbwBp)ePOd7Fgpb4SS@XPxx5ED0$PZ-|7PJja83=qe_p0{OIu^nPpXAP*5>b|MfX^&Ra56zfcrk zn8n_5kVO&BIvo<5a>E*~^NW{#`0g|qrTO!JYmEj^bSxudsQBHrE0Ivx@lnmO*c5O&xmZ&9^I0%F z^uAr;a0{@C(tH}XG!iJ>3EjW8xDjl9{sRm>cEEQ%M-w7yE+E2Z6CRTAN2Km2^%}{u zNq$7uH4={qjs}ddae~}sm_O3L_jr9Al)V4xUVD84cviii``*wQc%C&N_Z)Ww3Ta64 z>oPlqUh8b|3p_~ zS%fmabJXt9W97B5I@RTKmaiyYXctjO@!2?vsU?o$A+fibZZpLn!8fbEdKA^@-mm1Idl1Gk{aV##Me&X6qkDVG=41XYz1T;5 z5dgMHe##9DMHAg|mS_8-LA2a#ht{7C_(@n-Z|vt)koDX_il8yVOo6kSYD>@1OWUuE z#j?iW7<);(9Zg}lWY)pvtl558BJEMsHx5^ry*RlnmdPCD&zN^(CyyuO&{AL*Hx0&O z;e#R4CHru3PT;GhTle7QyTU9xY}RA>CS|dlemlfJ%USs*vlk}12;s8{4@vkVQumX3 zjpW%RKO*ZIiAMxSF?}rX;OT5wzN*Xrgi|h>?W#L-yfg;B+E$shaAgEAj!E78{Z|^A zJrogXz)!=)2hQK=^;gCm%l3>^y4XVxaA4_s{%z=G(|%2MEiW*e2W;T+w#9GvfX&&O zCNOZ}gXXaYFQ~oKTBK(PqM7D4wkvqoBm0yxjjvK$(W0$u($7S$hDjrDp3YMnp>c(7 z&Ytb8hIMBZ_v>1l!VF2y>?RE*D0x8D`%{`e>RLZ6T zZ7cXb%4@g+NtNQZ?dJ}l>L!~_XD%*-!x1Wvf-X4V9M3y5@_e+R_0Hrp@6bJD;0~nM@1RT6pp3R~s?h?$y(ObQC~tqY1W<17kE^r}dw9$}Gl0KU?ZDERVz5 zo+K^Nx4CdvQV#XMK-~}DY?d~;TnCA*T9l&7pes35U+kRr9 zZW!cf$(b3*ZV4@n%C#!5Xuv4$eDT3ZSG2OeenhO&5EFfk=puyACOjnJk4W85>NS#Q zll+LRYa|{K9En_>tC~0aGW6yd;PJVA0?qN8W2oh53vHKtv#wsO3{Mu{jrH+OMPjek zT)1{82&kT&qp?k33kZ@mX3E^S7k+6t^)vORGSpa77Liqu3xpfQ?&hgw0@c#_A`KiLr#@~IK`Ju^AztQr>)H#TrsgA`}GP9_TFG1 zbs%!GUgP|>M85WB9FkTY+TT)K0!*RsWTuLhguiEh}PWQzBQNnftLs*CQ@Wb!%P0#MFbkp!iezCf;=Q|PIn z11uUkty&W2g@-xc;dqrqU{~lE@alF6C~e-?mT!}fdSAV?x*VX4IXkoVq~t5Xi5jl# zmL5GgH`5MP)@FdTD48ezs)xXa_Yq<&U-zMBKkIp)DA<7?uk>tx)#QW7rChBSifX~t zqSk6J&1^J!@crH%j|&KX{~Z4lhCs83#K;fg(ZEpd=iz%UIiP73M-R{E8=&w|&DXL{ zYcP~FU%&tRHNf7s+PLLOC(FgVonNrp-_9}_9}CD|GLNAU>S87<|py1mM#e` zKkn|NEtEoWx)-G`R=ELI)j|c)8>2W#(l_DVt$BbwlVC{D*bxzg($AkTXot)N)2iU=FJxq6M|R2>bFrkEo_uM z>q+Tr(DUXGM@kpLdvsd9QhYWp*d*^s@sRk$$&T|3{z&)*Yuxv)6mVYSWICf>;|>UT z%gD2%PTOyOEt3WUKPEad)-_{w#Y>EMw2Ae=qvJE;L0tLCq%Na0Y;*YQ_O%P$fYWx7 zi=H3TK##ugU1MDfD3x`^E^VF)WV*MkE6A7`%N|@Lve0Wgs!}y}ka-=BQg#L%2=(6y z+j8n_jcV6HrhazE?|o0{KRzlNe^?v@Dig+Jy@U3kWUG#gm30ANBuQDBugVaQ>c8tV 
zlv0FmXH>lA*{hDVz4(&UM?TQJ`^`x_U>BsCv;}_{Vt=_{n;8~4aV+vf z_E3##xZ8oM8yhgu`-x6V^fjW35I&pmkc2-Xbw8=sNS;mdBeJgj5s#*DL_6VhuiR*u z#$G#qtEaMqwxEPJhpEjJeX`TnXqYLCeBJGY4#gOP>p?R==Zx_}!_SKZUUw!?LgQZ3e>Uk{-E{sFZ7fD8zTgWR zl8yJ_XT7Wh4x9Q4r7c;3e7m=@ofB39PsCSx=Gm=BPsU65vupg&jfEY#S)HMv=zF)F zpUHmUr5vVPO1N>Iy2)#;@!; zBPzZ}1J@SqBeE6#z~|uuMm)mDGC2}A>iB~vJ~=(#-#5@ab4piVzS&0?9FmG5;+l7 zuEr?#QIW*cvKo5aTg8(DYp>HrH@(=f!2K<4xxMEv=6Sl{%ly17jTR-4Dn9=1*r6Gq z)5(0X_qqk3y^>4RaY+m~)K#6jC_e@C*8Vc#ukZlTYZu72EOJ5G$yqrD-#+5&JIEUuP5xfN(`8}(azM;(MNjT*V}YZExQi|%z_(h& zH$wD&qSF$6jp!nT&n7%1;g3k&PwF+2XOsMhtZO755gaW%d-F`Hbsz2K(AU*RmusQ^ z&e_#&i$B+RUf`-za5h5CZrs;xRlV4BYy*fd%^v z!Xki``#lB^2}_2`nku|cfy1){tTV2K0VmN@jCu`5%{!8=d*URp=G=0*dPxYlo-*o4 zt!q$zUuoz0z65YhJ#rF9RqjsWnIo_1-=)&d+|bZNvio<4IFAfzm9NT>?G+V6?4F_r z)6UMr)#=-%WzH~Rpl-g)r^5+(-hadqE3bk=Q)RE(WQ}Uov4*|4&)rX(+qlZ==Ep8N zx1r-<%`HQ;8KNI#+pnpC+kx(@ZWJgW-TF3V+_9_Hr7!isJ5!0M!@ro|*bhOyorcaxjMre7!XnDwq2n60r)W1~ zJtp$_iQqcm^lGti1)mQ1`RSERkpeH2%^43bytx(-{}J(x5WS!1v_xMcx(MO32@gs5 zBU1O1dX41SBtIhS8i_{)M?Zsoc>Px{f~AkdZCRhqgcmfP4Q;+Y2XD-37s)ue4pp@o zbDp0=Lv-OAzs$T;;G6ZL@vHZUK%OH8bBd%z@T|aZ+iu@dL{Gou7v^R$qm663is;An z(aiP+C7~V`$a~H{Q0GG@9frx9^aioQ7@mMb>lA*`)8wpE8UCZ-`Dfi2zL9=a@qC8f zZy)zbhoRG!t|+<2(APB1b@NiX2;4PjIdXq@0194F97XYvSgfEagyN5&*s1pLIm&)0 zu)sW#s@Jdpx0V)_XTz|OaCxpXnke8}<2`C!!##H!bEL{-fN&;TV`955&?~qtEdFC2 zlp6hZ&pzxYEx^)6$MwQ1y6Bu!Qo%Rof?G8&j_)2b1gGm}l(2J2Loc@Y@fS}7VRMnD z+fZi@oh3r}D^K?VU@32O{r2br^u0QFuw%X;pgVu_vUKIZo8MlmF!64nt>B<<{bV;x zZ++MMapU$v`p^uHfh&?W_+|9W4~@s|Fx%38wtNvayhVE-ybL8^P;kPNnCqJnx3TNi zdZ8aQAUG3oZIZ&f{5hnr?$X3YtV}&`S4iQz*12v)?4RkO@@L>xM>*trUoOVWV<9F! 
zHsU`bz7eAL6P=dmYeW|zd^X`B34cWDep0WIJe%Z4WL+cih~UW4_?vH|;7?k_g;~o* zt(kH6&e^x5P1FGAxk9J9OdcS>>CmU~Ss4CM)gO#}%MY)ef5y72hzCC&Rycg9egXI{ zaqf%e5)mZ1&uO@&Y_R4=k>)ycwx_hPN1<98^B>TPIPRIUSis_w@N%4TO$^%`6!^g^*)&K1qQ9>&Oz@Mp(cV-rf8 zD3gYtZb(u>o0}F-;>hS#-dEv_5gPqwqH|yA54xex$Me7JMS-ruB1_j4DOB;u^2hy~ z3t>b1eAxDq1M<6{6W=7pg)d?DR~;)kkjEF#!DL5Pu;R8!-}1Q`dclbjeb; z-Tc|XG|OO3FR`wVbo*P;&*)zLv```Op1dw2%+YzGhvl*c_ z-BIgjMiB&h=_N6DpLfyMe${2MxUY@zxM3BI&lvC042*cHwgNvLwtukj5;wl-D?TQ$ zTnKR{DXDHRzD6T{NaAB7{v+ZWA$mX2X^Fl@bP>X56CRTAN2Km2^%}{uNq$7uH4={q zjy?>nUb01205(hJ&1oJG2NmN=%T!j-K%(6dRo2+$SgMv*)HE_d|0M3K%xy0Rzu)su zt7;$D+AenReR}h!8hx#om!HZn2Nj>WVY}EU&FPhXSD08ftwnT9F0Ypvk7@p_6p!YF zwHbH6G5nAyMF%i^Z2OqhDj5DFSz+zR4ByDgp!f$2y+5U#zKhan@$KbRFD&i$gMPEL z&6F+zd(Xln6rYXt13A9V=-3PPEI7wX@kj7Z{HAiM?#FjD=Pun1cLR}oy+5fu8y?rc zRZ^}bhq52P+3~K_5G-8JXTpd_6WX#a3>?XZp1a(D1{$)YbC8%-Z_EI_pjhS|(D zYw6$Zjf2G|Mrfh6u3uPGx@h9hmOJ9B59qsoB-q7X+z9>8POQ&8xD|5lVNpo1)P&g| z6ZF^Ik)!wy-sH#mY9LK`)U2LO1lA|*ZBGeRgv~*Lc|Xmxp^4?rfK{@Kp)=pc7}1*( zH0{CHFJ8Y@1-+^Y?>U(7(}}N^_#ugpjrfm8F=ra1@M9N_Wk>wi-GlblTTMwXoHnvbFCIBYJ=;1ty`t# zwbAN*!=ISlJb{Dotl)23T)~e(Zx%nvO-N=V@1=lC$It{li8L;=g}j_K}EG6qWW$HqOD?FOvU>zhCRa70nLH_kbn>Y@8Tn7^{_Q7`T0YpZ!` zM`ppg(T*@>+jXE@@71@$_1@}Z;O`ms&?suC;k>+?T=Xi?u2xc;XG!v z$NrS-`z0S~?bVz1uzEGpuYB9FmVNVVNXyrrH#%PgSW23FHgWFK>Sj;WG-*{wE;sk6 zuFzkGn8eL)S6^<^T9D57WYg$26qKh4o(=kd`jR^1E9Z6r$&|(`<=n2IzrOv3RC_Go zVn4#r`yuD0F1_>1x1y(Ud6u?PG2mKuCPNp21FK&r%`*)^I&ZYEEgp9UJ|c6}DEvfCZ8uHUYrSt{C-dy`0nzcg>jO2<;sO`oB2iExwZPRaUI*#zsGP)+nZf-0 zgGK`Q$$l{-6-!aXA9h9VeaAYaAD!TOxB!5|tn%f@+Bsp#nlnmU-~6OkXu5q0(Bp#D zgL<8+U-XfZ-NV7XuBspoJ~&jUf&u^~JGXs{^8x8|jK_by4F(eqGBJ?` zfgsEY&fRFd4z+(=eL9c2j$zD(+5mMMpk8umRlMSMghHE=mefa~eOdhv)k5rnZ}6G9 z<1%)rG$>$ODX$t-eALR-;GvC(&!70SiLaLUA&HNT_>YKhgy{W5rzQFt(M1TKO?XJc zACbDB)N3TqCixLr*GN1fIGSnuDEWbnITRDt^j+sl@v=Q{H{MHKiH&NT=6T2%0giO* z8{JaoNJz&(yZ5FJnk%BeKv_@}X79ewBfe{#?jy3gkUvxi9*sSrQW?aDPo-FGV)*>; zri#-U{%ngCp;s8b+NU#~xHJ5a`K1r-89uh!md30|B{quxxt0(Z=LYoK3w}^~KgFYO 
z+caY#FFv=#>%;dQUcmgc+677%!AxH|)G0n2zxriVt;6k$6yweIQv4AXd#Ze8=$JlW z-SDC@_lE^g$jv&($g{Ux*zuE*A2km6H!;?=Wv_YJGYoXl61V=lE^8lsF=v`&L%t4J$?>c-$4nWa6c(E-9YYKhgy{W5rzQFt(M1TKO?XJcACbDB)N3Tq zCixLr*GN1fIGWWiJ!j^a26Rn$GJb9OESxLdVRvGtDzF{2-OClLi#FbCh)%yTA2_^&qc?6*C^11S z2TN8``WnuaReetBBJfpKWZJ|nf7EmB0gF6U)Gx7X*2*&MWmSaGS9$RTJ}4o_|s~axo19eo{P`!`M%dg zaxPq7=X3tX2`jvPe?`jE7iQS3-#=%ajw}w@AAig(Ru(3+x9o4@-v;JAP>A*=v2jdwZpH4=JG>A)x?w! zU)5i+$$5_`OxVqFcN5ij^nPJ^V?5O_La%Z0X88Q?^}9we{MoZ*+9!Rr+@qXvlphic ztd+K1qY!|!ySl_E{}F6^y&~;JfjZ{B^>ka?7k|X`KwFg3X(7$1O^VXju;^av_sxsl z0Mk{}G}QN$CT^2hQkV_UO4~fOFUOsM?fT&hjJls+?0_1hUeiroJ=2iNv(GKE8XHRi zpt3@|@c1)pFt+{E2}V4U|6sa|fg?$~XV+z-hG+vv)`#@(R|Co1x;8DlAc|wNUzD?z z;^%U5f)=Ykrt`}99)sUF;H|cg`qw#M(6mKvo1+!i=;8{kngTn0;Eu`#Wh-q1;9&4u zW-b>CsH*RgF0nutySS{4yocP;ce?#e-_@#c|F_`6feIDK8^!xwOVSj6sn;#~{9PQ< zzFmuJk(i+IIeUFH^prrMfyHZgKdGjX{x#BfMEXUD&!70SiLaLUA&HNT_>YKhgy{W5 zrzQFt(M1TKO?XJcAN^7HPt|KA&nEd1S=UHBA~=%Tb*DCL=?E>+<;YTtp4ar2^VPP+ zHwJ0#_MsQS>vTH~b*p)b+>3v%;XjrK+P(RRpDQFBwBK)&jwP$2*sXOu@iJ zJ$_Pejzrjw^3Hj?7ZrT})a9YR9j!faz;DPT2#scjP5L4C4c?m2>~=#J&-fn;yW@*| z6u*Qpd?QuHBfVc)cA^tlQ~hi9CdASCWKv&S{qs%Xk}P)=mc}hk@!7N?_FM)JN$)Y_ z5S;5gRHJ)r-M&*)U3S91LAh>$1X$F0@H~}g(;mI4FQf7!y!C8iKAVXY5DEUmBev2U z4E(^tDTR_K$1`IUeo{hz`7u|TU$&n1h28v6+n!z;P_1#3cqk9rm(22i_*fXcyL{VD z{?Gzs2U&RE?czik{9Ln6?Un`FWm1bS%5Ma@ad(X0cc`LAL#+JwbIp;igD3nDvlNy- ziyqD1p#*>z+|nTL0vl%qi1V_wIUh<{w>7d0~X72j>=sBH3T&ZX095K{cH#F{kj{|LgH!x-*zYKB6 z7OuQLJO|Vr&f6Z^u^N5rV_GS=Vl^5$JvTaVl+qW9F1$8mbpvVhuSI8^-Ub93kFv@J zZ2%|6jV67y`O?Z(L3Wi|WRS3|T+mYdw7U50@4(~>?&(!WOfj!3@<@%a;fHu2RGKP2(75&seK zjS#(`=(I#%Bf1FTvk4DL_#;yHlX{Kh*(5(A>l%qi1V_EW!vpq3Uum{CANgE?5=gCK zTUw&-3Lv)gJP(!8tySGc_sfOR zcYD^C_=N^(Rg?XNrdq!>M#EVZJe*gSKD<;2TjvF83PEdfKsM!R-; zw*PiCDxtGMNmdV(npO{TJDY>6YNBR>zTA+b-=kYea|s}Q*`%MA^g)vTHPUxP`bCJ( zpZK$hua@{BiI0u=kBD!C=>0^eCHflCMF^iwcu2w@k-DGMYb4Jm`4L&yNIW7qTE%Zp zmrrJgx4qiAZ+w1APlaMDw2azlD;I81^k4gt_F(Iyk5peaK48+#=%>Y{1>F8rA0+fc zEP_=38VoIbJVx~$Vcv0H-Zxag$VHxY2ZaQ!(1Ox$9~u7av9C|c8NS+6$rI5Nx7VPz 
z>yA(Q*j_J+DPZ`IhFc5+KVCFPhx9HfG4%d|cz7b|E;r~<60UR4)(4T|qwbV00_%mk zlPEr$CJX30C?1m5N)x|QzW#a5Ct?5XyNWf@xx2z=sCo@vDSO08<=HT!JY9gwk7yIe z(-a2mSAc9qpY&yu zep=E8N&44F-x28-AwGZN&nCWF;)f(YHsU`bz7eAL6P=dmYeW|zd^X`B34cWDep0WI zJe%Z4WL+cih~P+HB}BS#RS!+#LzVYtwaavY*|(1rCv-zas z>!TD7p!>}n9~gc}H?OKuhL5d9d(23;BoKXGu(_Y%8+jwc+E^MaigaC%bRAp`5j~+S zh0@pPJ7A9&<3#af;cd^Tx}Pp_0E|)f8a)=e z=TLbzo)^w@{z8&C+V#WSHL`69xKTMPZvP+$Iw9{k%)pWSpp9JR{wkXBClA|8XB+5- zTNZ|}y1b*cwlyp*ef5}DtM0%dYS>A$`KF#car+xBbz$;WpNI7{g|l5A*PKnk+Nxrq zEqQP0(Z<{d)aI(49!+m9NkR^!x<#}C>5 zm=i(z`$=Cm>8B-qkfeW&^c|6Y5#sYF{%qo_C4NZaVYSd|;*eX`#XQ?GLCvNcfC<8J*F;mb-cW^=nc|$Z7X8Ge*Bik$;kcveka{hWOBCB8(3^n4rwsqm=mv93jW#*(Jvb?9*p(IZ z%SgCVIxQ_O9s5!G8qHipLXuUbi_WR?oNjbP7Ze@({`v4zB~;NS)JXA1aMOMN6sqor zTki-k>NR@FasSzw8H*8XLb;2gDGv~f%w2xnd@*SK)_$K6kJeVJYBF$?9sKdJx?&F< z0VDYnpRUk`-syf7u{FkbWE84%E}3E}|FO-cU&Zm(LA{#A7nj1Gf%}T}@AbgP>w{b^C7SQsn=(^S+ck^-(oXlh(4?@H2Xm( z^G3*=2-4qA`m#wsE$M?K{cEJ}i1dpPpFiSz411ra@>BChXsiw295Uo*!+Cf2x>NoA z_?$*%?M=XkLtkb|Q~k8~ah}}^Mjxcvie5vvd;3u+$2~4a-%(-Uon%Dyi!iTg4DPZG zLQG7`srM;=c8{Omau3Q^+Z-6BGT+D@_1wGtcqQdyd%N$fU<3yj;`D6NTayLBXj)nK zhV6wk&DNNHN~fjIJfBre>1(w9@nzB~ifUk^lnq}rQU=wE(H0aBN$YqlxSisUaLLh$(JG zb$5s+ZCQvs8?%WOoIR(%urP5QwDa$ZpykRzyC1VAWPB*xq_u|2-_}J@v#yA5%G->y`Si`L|-Gi2;s8{ z4@vkVQumX3jpW%RKO*ZIiAMxSY&pu4bBf+n-JP6MWMOe`a!%1|srbn`MIbKo7h_J* z&hm(F)SMz*_1$!_+vk12!M~g_rwB)$a)_kn6yf#r!u?7U4ge9md!f{vB0L{7np1O% zut=Sl@(uqLpvDC!QFDs0mG_6^Hw(6bOR=4aBNEXOolP1 z2t4|-D)_wv%H!S@RQbXkbwrs7G3FGtM5^{P<`f-Xn5xg1Q?$0!ni{=liHctSMp4_i@QQnITNg z_o)QS0;u^u_^Wupk#L`tDC2vz1U26WlKDquZVj0?Lgqw}{(jPzP5Nm`A0+8tBYj7t zUxfJli9eh8YKb3`_}Ga5i1E z9)9-8IYl1TS(9^$A{`wk=M-6$Xc{u+6eXlEKc(grVW){XuSWb%Aj8BJj5$U4_Uhf2 zsX0YBN%8&#+V(Vb$KCZiHKzy%gxq^W%_+iVr>{JB2#rLAKfiELbBZuE4@+`1`t}e_V1y2ZrB_Fy<6J zO%k$Z%qhxv`5}%mr${fGGcVoF7368H6>q9?L~qpXuE!~ufTevE?Ek3=8Z&+DOOboFe(U zEv3|)BB(7h@!(x(0r&zqWvDqta8?iRH)>81ym$PuT>qY2VAG_=m{SC6&zpNtbBbVb zU+TSg6R|+2zHSdSrwC?t*cLs#d>CC6Zt_LeM^UTtyjP4lMMKYu8W?km><$$MGUgQ7 
z$orUGl=22QWX+cD8ubE8%aaQkbBadhR>?8u6cvWF=rHCK$%El@DW`nVPJgkwb+Yc@ z9Osi|qp9nVOKUnGHKzz?+MN}k<`lttla3^6P7!=tUOqw1DZ=kX+)gm&?+Ei=@MFy1 z3Hh-;a`LA2!0Xh{Grc=t`>So=-8k?f^G!&e%xtsU&CDl`)uqX zu|LAxk9iG!Hu@v{*HDk}j`nPR+RZ7t5dXTHQzRjtD9&?=s6k&l?aiAw^6P`-F`iRI ztCJ<Ri4Sxhq5j8BT=<#k(5wYzzCWYq|(W}R_`u@>~ z3|CfdZ%K!%P0+LjAe2xF4}wc;9Nwm3q_H8@##v*t20gr-bMcBocRSCy&*WhVgtK z2Cg4?Y~W~te*|s~cq8CMz~2vFHvF{kLBhWV-x2&G(D_534P7ntkkGL~e+1nK?)|va z;=YEv2=>|7Lt=k~xgYZy`fT(^_^+WJ;T=64QFcn;6#c%E)Qjg7(U{eFLvv% z(;<%X5mTn1i?Mk{jtN3?yd;+KoFW?Lpd&WO<`}o3)U1`~6w&yK zU0p?6c5{g@^v^xa80;i}_E5P*G44$`<6td`KpExoy z@9kQFQ)F6v>ZZUc+84x8fm75K9qB%Be>gWf?$ex=+XG3!&x#M>qx`vm*PDm)oFe*T z-hMltQ$+hGcXjfdBHAlc)|KZJ(PxGQ_XYk=*j~x&0)JnO`1|3@hMyKbNch*_JAz*XI)CW1p{s=+5;`{M zkDwdDy&rd4+}Cgy!9E*%NbHX=_hVi|pN;+q|25PjyrUHHpBe(EsP2n|Wk}8rBIoho zqQEIS)8O3ADH1E5)iUEcW;d02D-$(Omqk*C%AzbD-?pcgDazWw%%eYw(nGp zz$vm3U&INVqOpc!P6?bM6SKDAJb#Bhx?cQ<=kKsnW(R)o{2ey-hpp>(p1;%l{EL-) z{YrB5(;rQNzw@{KlEC+2Cv$Gp@O&TEl;b1teOTVYgB}9k$0E6XJkts%I(4^N1ilYA zqu@h=>jxeiI9lKzfm;LK2sjb&_rsSBKP`Na@UOvl1iuJ${?KPbR|`EPbZpQcK{tYX zKkl@+ui-9&eKz)x*dJl;$GnC<8~qXfYp6$fM`ks)-JBxP-q4o~TT_Xc!k^X=jEh z_LMqZxt2Th&ARG&@gDAdi|_INJnuy#ec>FxgO$X){Q0v&Nq}BGIHTZ0 zg6jt!8#r3vAAwr~-Uv7m@b|-)4L>b>knpd;cLcu(bpFt1LsttuBy?=hA3-;Qdq3{9 zxUb(qw^CB;I^;u(od^*{GKKTHq8#J&|u*pt_Aa6-kzzxSGn%IWZ!T=M=F~X>qnZ zr-)@wuuSARMKs8@DWufEk2r0X{!;sRA1PXsd4=Z`v719wb$Lz^>mS&8i{}*4Iax<5 z^)Bz=+P=hl+pG&EYszjdUXu{T)lCg95I98!R8im3-^b~)=~tt5KBVnH!dHRs1MV?+wcw0`4+*Xxcx>Qkfq(Rm zTk{`p1e^%?`{B!mpB6qy_}Ab&f?otWf9SKJtA!pCIyUH!pc}!xA9q^Z*KilXJ{x;T z?2j<_V_rj_jsEE0f9=0|gm89b+mCZ;YII7PIeK3m`vu|*m;*XwCT z5rfXIa;x5lNv-s`T%J=z&sAIfC{J6&dVcb);yFd^rsK%+QwJlt4LL{MmKSU#nd7Kr z>T-YXTPd3?aEj_SD*5#0N6r^S5@cMIP7zZ(W+QNlSoQOn zii=Z%xX(kL=zlyGNQQnJIJw8`ZJhj~T7grisqeoT7Vvx!s(i#Es8=1x`_v(T*r3sbsGImuFt1N?ggreXc<|x*?>l zhsE`eW}aMA&|>d^@V%Vr*!uQMD$a!U`n9C9x`L=STZBa)-#~f~TrykW?`+;xHZI{} z3<*}Rs4$tGN0w||Q?_+$4Yw+7OSb#g3uLX)cB7X=&T^*v%DeeKO_H6&<-vx39Cq-t z!951A7MxM=A;I+nj}06x@Q=W)0dEAH2>AQq%Z8s8K1lf2;5&j}1Ui4{v!Sbn9uhh> 
z=#QWq!Mz`MTHM!g7r{Oodr0h$F!y6#L!XWQ2>&(IBfO)qM^_dIoFbo}d$$OjqP9zp z!{?D??&vSgK?0{}?fX3^Ol(5A+Aq~^F;ywt)w-U2cuomQT}oTAs&ZjAz`$m4di^l_e36!?6Xz~7={|5B$dSeW>VT zX^6o0IW%13Rncw>ZdgfNmB9A_&mSCi@Uy`^2Co*JQSc$b^#hL$94+vVz^wsq1e^%? z`{B!mpB6qy_}Ab&f?otWf9SKJtA!pCIyUH!pc}!xA9q^Z*KilXJ{x;T?2j<_V_rj_ zjs6J#HPj=#qa8LK-JGHx&i0cu-7j))6|E!$PEn3hubP%25uDAnm;Ia)3@`S zBI@~7yo%=((ZLr>1x^wBRewj+-#e15p3+B4rznbC%oADjoFba_)5L-26tN+}bpoe| z4Tx}c{~i&>^YUNroR}OyRKMwk>9FnO`1|3@hMyKbNch*_JAz*XI)CW1p{xC;hx|{+2K^CqBe?hD zPK)~*?jqP{V-Jb_5$1l(Yv{AlAK|}-dW3g$DYIucrzrm68CRnx=SaiUZ-zXlh@E*W zTFP^Z*ljU`l;luf&c1PubzZeEiGQx!XS(rbF4{_7;1sc6N@K(h+|XoN>PK^|gAQ@~ z`+ARmd@PGgy%;$@xiy3I+u`5MDSGd|qnlF{l=i_`;1oR?{Afb;#Y3D)!X7U<*|}t& zuKKDKt-Rm##`1I3jy&&T%zv-Fp4Kvfxui7vZcuo> z;`uw2`rZC5@OSJ?a~^uJOJtlv=Wc@f4m=!foA zf$!t;SNC#9L?|(D{M^m=K~4?wMUV>tojxeiI9lKzfm;LK z2sjb&_rsSBKP`Na@UOvl1iuJ${?KPbR|`EPbZpQcK{tYXKkl@+ui-9&eKz)x*dJl; zH~(*5L!XWQ2>&(IBfO(~Z3f+(BJsK5w**enk3*MU%iesiH=%Qf{}i#_T&r@f$!ZS+ za#-r#H-S@>Q582r;1t~xyWh2|&m{fcLlO?&_<(u_mo(ah@cqVX)N{^^d5jaJn z39b?Xr>Jzyx2UW0rVk~Pm@?To|PHHB0t6w zlOEaQJ*TW7H&tx@nm(~6Q(tuReb~W-C2gK%OaA2@A*TlUBFKdR&mSCi@Uy`^2Co*J zQSc$b^#hL$94+vVz^wsq1e^%?`{B!mpB6qy_}Ab&f?otWf9SKJtA!pCIyUH!pc}!x zA9q^Z*KilXJ{x;T?2j<_V_rj_js6J#HPj=#qti}mnF6Os^TgQV4;z$8`&4(iV>08p z+XLQ9_Rd^No(8P0v zPDu$BI7On9R_g^$QK^FI^vxx^6`lS#Sl|?i|DCKDc|?qS z(9lnczu%8DI2SbH)khVQ{`ZMG&naSAa&h)Nr$|(77Hi6Lidf!)!%;k^NHoB_=BmKo z(byt4Md0r!e?40v@ON5Ul@{>)9nm1GyCz2rCU8!D#035hJ!o|Ny}Y+$`iBA*TlUBFKdR&mSCi@Uy`^ z2Co*JQSc$b^#hL$94+vVz^wsq1e^%?`{B!mpZ1>*@<0C?d`Iw$K<5vAHgvVnLqf*} z{SkB{xcB2ui~AbxBG_kR4~hK|=6=j;=(Euu;lGA@gm)BCmAXOT6nzYARuVWxzR?Ru zj$JyPTQ#t=SCEN1(RA~9BXElD%xU^0aEgpAmAg4bXENJDEJ7s6-QM=kzo~vKxsiT! 
z>VaE|#5c&*oaYpkdiClcJg2C1Pkw~JDJm@=ej}Ub6p5})yw^+M?=11Uc3$A`Z0t4P zU*PZL$jq$a`8!mcrky_!qR(-*pN8@Lozi!+$8__3X5D)4>wERp;n@O}2`)JXDt zA9kbCAhG|-S;SL)z#*RRBSMZJ^4E}?g}fu=)F593xe(y_gToGfHn_*&)q*n$J|wt) z;IVPBe_%5#P{glcY47cV)J-TQR$ECb9VEb zqEgA4wkbTPh?W}mE#x^xEGcbHt-#-TR<Ajm*HD7(2 zLCoj3P@cad8sVnb&G(snSht(+V>8)vy1@5QF5E$Rz7O^Cb}T!3*O()()-L4vJ`8z~ z$nit|8gjFccZ8f8R_ZYlda7Mw01lJEdHgL4SKLWP~yb*9B;O~bo z8-7~&AmLww?+AVo==`D2hOQQRNa)z0KZ0%q_kP@IabLq-1p934A+bNg+>dz;eKz_d z{MS&A@Q&h6#B_6tuAd!TBXEk2dh3n8Z?}T1Jh{f|5$8x=)L(UU{q4@})%OrMMQlTl zlLDuRI&dO^Q^YE>HrFT5GbIOJUC~+kX3! zBUAsj%=zZW6}{h|d9j~6Y1@$`@O{`_z2P+rD~d=cbA zfaebmJNVh)9)nj4&M5eh;QE2b296f^N8r|gHv&!s{QdA{!%qtzB>Zde9l&1 z(A7c@2^|~sN6?Mn-j6#i?rXSNq+n9%6YbBaV=whjWPh_!rKJE`1y3g@9W z{zSX-9B#vtIl~f{PatnSrTY(Gs!b9Tb*+zk`*2+;olkn)jpSB{$w^AwvgTfgYNYg@ z;ZE-Dloj|pEReZxdE9dqpZBrxsCQWe=c?Yjs=4=mF0gveu{rUc+@Q3pJrZC0ahn&c zPT~1J?9bx9!|TVaBgm&kt|;;#k>iK_HRNU??+7_H$QMB_1bF`cIPCxNv%x(EuNIt9 z@FBtV1CI?HE%1-PtpRTYoCx^);md}f7CuP$*Wf$)=NI`;=MQ~0bhXe!LdORE5p*NC z_v22B`x@>d*k@x8iTx4ge#~p=v(X>nzlM5*ceJHR=CHsi+Wqd$MuAi0`eg7Lfm39x zIei|PJB%E9`Xl66!UXR2jz_-LgN(_IPveL3oFdj{GiC_SDH8p9xGj(86tS6?t`MG6 zM2lA#WDESA2Dvx60)JXd>`ggZyCI=&noVh&8>Q#@AEI`82Plw6-6E-a{Q3LhTJUV z9U-R%`69@L0M8#BcJQ;oJqE89oKf&0!Sw@=4IC}-kHD<~Zv>nO`1|3@hMyKbNch*_ zJAz*XI)CW1p{s=+5;`{MkDwdDy&rd4+}Cgy!9E*%NbHX=_hVi|pN;+q|25PjyrU;E z7y1aCqA@h9OW+h;jeoDAm#)E`8L+B(&Tb7dc6Lb34~zN4H?qGu&naRD>-!|~oTAc% zm@7Iwr--@i-*AlQ6p7l;HAV^io$(SM=Lr0rfv3-p7x+7NH9L->2@~kHZ4rM`upoYJu-#-}74<&-Y>ZH$63i=dI&h=iJ%G^L_s1 zWh3Vp`LxItMIIz_{E)wf+$`iBA*TlUBFKdR&mSCi@Uy`^2Co*JQSc$b^#hL$94+vV z{&8#m5tDatA}R=X;IKcCL=qs6% znZ`0}$g(k66+GXEA$J{l*~mFYJ}q)Zkq3z!Kjg0=Hw$@3$f-fT2y!97^9P3={A_TK z!K(#l6nsc<{lH@bM+^KTaBILD0Ve|fe)zKCr-cs^{x$fH;1_|;ANp+QYN3aOjt%-F z=tgkw$DJ1UHQYt8&&D1S`y$^EcA9@_^<`fw` zY4yF3tIwU-bnaci5ji4y9}_h9?*KAv?C0x0t0r=W#-i}CeeH=$SG4wt0w0nge{n#+ z76&rvuUdQXjFH?&Go`lzf2Y#&?)S-g3rIpp*oQk zAxEx-=yFjr_jL1p##TpPtbDqV8@KHA;H!^~IpnS*FB>_>$frfFDDohYdt9gP#rVF?hA$jDimdt{-@8;Anw=1a1v@Bj7~9-w$6l{Iu{v!oLRJ 
z5&R<1`9q%#T`lyG(6K>(1l z=;joC(-!aM6ulqGJ@8sSgB;plIaBi)&tKJ?|9q|F?-Fa>_GT>EK7nOhoi2_qY}T6#Ni@>ROy`Ug03 zo-2%RCJ!{@8hsZ}mGssj=U!au=KJ&t&+K_EL7#MZD<1ykF^#ki^pxsqH2Igij=XH- z93!6=xuVE}M2;Ww*N~fqyd&h)AYTNz5a9WP!w!BnxX0ktf-?#}B)ERyv4Nun{t>t} z;EjM20e?Sy+3?fC2MPZgd`Iw$K<5vAHgvVnLqf*}{SkB{xcB2ui~AbxBG_kR4~hK| z=6=j;=(Euu;lGA@gm?7t{^f2?k*(_Ngk$y7xlvmN_^AXdkmAv!=YD%7%l$PS65sG! znk#Mp6nJ5x7x#Bl@hk}qXU@3)ExF>gQ%RZ2fqlltvfQRX*>3(0w|@TYjeqA7F6YLx zn5Lz~BQPuF&1rpdPX9;ohNnhEa{Y!l(~22Hr~mwJzK__t>66bJF6Hig)NkXtH8eo3 zhrq3&(H^Dicy0}Q@n7yb^0JY0jC@+;iXsmZIey4rLv9xGj*wG>d=cbAfaebmJNVh) z9)nj4&M5eh;QE2b296f^N8r|gHv&!s{QdA{!%qtzB>Zde9l&1(A7c@2^|~s zN6?Mn-j6#i?rXS_93^NfQB$~2{GcuYrP~iJaRnq&* zb8D#d;weo$w}!^PnU%wHYuKfAr@MG=4MXla^0JY0jC@+;ivG)k{4d83`D@6{Lf#Q_ zYLG92TnO;|!C?nK8{A{?YQY%=9}-+Y@Yul70{;ly8t_KIiGaT!zHInu;e&*K4Zb7z zMWFMCJ{!7P=pmtFgZ>D*5#0N6r^S5@cMs4kmuIW=0NpHJhz6*ZGPU!b8A@Owqt{MZVih}`dQ3#YZ!9Zk(Z5}W8~8! zR}^`W$nit|8gjFccZ8f8R_ZYlda7Mw01lJEdHgL4SKLWP~yb*9B z;O~bo8-7~&AmLww?+AVo==`D2hOQQRNa)z0KZ0%q_kP@IabLq-1p934A+bNg+>dz; zeKz_d{MS&A@Q&JtuXPqUMX^4&ssv8a%ZF9Hc>WG8o%&!5&)=b2-{dLq{2e-=VdE)* zzf<$wMe&^p&--7oJ4E2`^b0fi$@6_^)4mx3--k{rB;pB zBZuDD-V^vfnct@W;kh-X!z!DM4^NR|=_@ne^4uDF$Sq;=O(IL%){8WFZVf&3pw2+x z)*yEsdD+N0Mm{ZaMUe-I96$a4^4E}?g}fu=)F593xe(y_gToGfHn_*&)q*n$J|wt) z;IVrg~ z!<_H!{>pg%j#ydnqd((Ekh0_^W7N&j=n87><{vCfQ_B&2>%P^dN`l|GWlX8y z6QwGjW8#mWJ3)}Uj=XH-93!6=xuVE}M2;Ww*N~fqyd&h)AYTNz5a9WP!w!BnxX0kt zf-?#}B)ERyv4Nun{t>t};EjM20e?Sy+3?fC2MPZgd`Iw$K<5vAHgvVnLqf*}{SkB{ zxcB2ui~AbxBG_kR4~hK|=6=j;=(Euu;lGA@gm-lC>}Lhd2PxDg{*89U!wfo1s=>PW z)fT!`bo0#bkPU48ou%89<^~cU-@&evUN}&%4FAPl2Pc?460dz&|K+ z@2MmB*3%BIxx@R7X?D^Rv2Tki$!d?o$9R7qL+(2AvXOI)d|Kp+A`cQde#l=#ZWi*6 zkW+(v5#&OE=MN4$_}SndgI5dADEN@z`hmv=ju!Yw;MRaQ0!{?{{qSYOPYWL;{A=(X z!7l=xKlItq)j|&m9UJsV(2d~Uk2@{yYq*PGpN%~v_D7ieF|VP|Mt_9=8tM_=k$#W! 
zRok~@Q^$2tBZlX1r+rL*Jr42inyxvH=R-*+DEgZ5T%Tns(i4C1 zJeEOgBi{%djl2qpB7uM4ZF^hrtyASmlgvjJ9p%=VpH}C6qHM%`O~Kzsk-Lt(Y~&mx zpBA~I$b&?VAM)3bn}xh1fJxQa~$NO#K=H|OS zA3kL&yZEK4y-9FLek$ZD@iD*5#0N6r^S5@cMYE069c1u^8Kz(3fu&i=gMTbH+aSdy?io}`Npye{}e6RXY-68wG0T}NIv za*mNti(FCUK_bTw`D@6{Lf#Q_YLG92TnO;|!C?nK8{A{?YQY%=9}-+Y@Yul70{;ly z8t_KIiGaT!zHInu;e&*K4Zb7zMWFMCJ{!7P=pmtFgZ>D*5#0N6r^S5@cM4wg`F^oZvx(BpfvCux2^K_vC23VvqhWlh?6A1XCc z|FgZuIfWp19eLTvIYvG$az&8`i5x%VuOT-Jc}K{pLB0rbA;9wohaLQEaF4;O1!oj| zNO1kYV*^JE{3CE{z#9Q40{(vZvf-zN4-)=0_>SNgfzBWLZ0KsChlGv|`XlH@aPP;R z7WXyWMX=At9uoT_%>9_x&}XAR!ha3*2=C~`xqIF7;1G)^TEaZ&p-|u?%!87x+Nb$> zkot_jQJ)%NOin56bmHeh_GpLGrEbpa1%v4VpVNA;Yq-Ew{N1%EM&RK+H0oa^aLh(J zjeO1Xr>MnT@z5BHyM#pdpU?ANnE&MTFFdD%e);_7(bVm?$oOq-TX|nR3*7xEk@qvx zYqKPqWxii0mB;s$@%}-E+;!w-Bj*_Tw8#}j9wc)7kiUl9EaV*_rv~{V$b|sU9~^e@ zv%x(EuNIt9@FBtV1CI?HE%1-PtpRTYoCx^);md}f7CuP$*Wf#XUj#aT=(C}#g&q<* zHt3I_8^OIFcUs)na2LTo8+%CXk1+RRUPGUa{s{jy)FZqjxs+!&g?aE$z0PQ19-Q|g zy^fy;=`7hjfv>Ob;~ph{8N<(mY}F0<`6qc^a8}g=rFNe4nm6_3GF6_>S@$VVR_ZYlda7Mw01lJEdHgL4SKLWP~yb*9B;O~bo8-7~& zAmLww?+AVo==`D2hOQQRNa)z0KZ0%q_kP@IabLq-1p934A+bNg+>dz;eKz_d{MS&A z@QyZHDuoO4U{|1I0Y4AYcIj!#rcZrI(4g=E{5;5pxQ)H4ojHMesG8?jChJlo1=&?R z=an7175CQUwkiAATHV5P71_&ATcq209^UQ9g}payxlTSkO|TL8Q?-XpmY(6cK~t+k zgBN9ACF8uq%>_=!;`w=cXL-KD=QwGD_rETYcc1$_7yQhjqj!D1-epG|UW}A4kex%2 zyNKzxowjOPILRE9tJ3&Dp$R6eDK=a}90`WCyRB%d4B*HKg82F{_`p`Z$tj)NvTiIQE_c-hA1dn~H%*K84~mexj=XH-93!6=xuVE}M2;Ww*N~fq zyd&h)AYTNz5a9WP!w!BnxX0ktf-?#}B)ERyv4Nun{t>t};EjM20e?Sy+3?fC2MPZg zd`Iw$K<5vAHgvVnLqf*}{SkB{xcB2ui~AbxBG_kR4~hK|=6=j;|N89z`Xl_;P>=AA z)`;eJ&x1WPhrji_*|%)eI_`ZAZ z8p+x0K~vO9Yc+Rb8RWlBxoQ@h<1}ca`s&_n!;>M_<>L&wnBUnw*8CP{Ctqu&HSTLE zRkR;IbVkTfPAcT`#5|Q3rBY)yD|yVE$w{l6wN08cfk=!_H!)tVO>azTWU|FW*{ny4 z=ZyO}fiBoS(kyI$Q;FRZ{cI~AIWkoBTEv1Kb8t+`RYZr|^JHjSK z{;8f}+4?fI(jO&{JRN%6$*M0ovC?4GoaZT}Yr9^J-n2JcWH0~Imz+oy&2U}2$1U@4 z>5!5I=c+!jLT5g?nlqD^)wij!bFa5E2 zWc{V18hY#MQrGNg{UI7(HS$8P&zVxS5#EP#W(kv*?Te)V2D?uguSg-eyCE_avD zd#v@RLz?#@;sn1yR@29Fnmz*V)nqQ)YUs_ 
z+HB9vv(pY2>?{cwu{`1Aop&XbEo(jdxF@iV#)z3FQ!a~?v!!IO6m1vTx!L9%Px`6* zy~ti%O2?$6;6sx8v&KDokNVA=)i!cI?Hh5ft)V=$wE26p$AU)-O1`T`g%+N2DUdi$_MvYk?yO(vW>}B0`@J;FK(dCO%CDrw6d-quuaO$TB? z@11i3jH&;)BM%@h~%NgyG8o=e7897>f-9i(qf7gYCShBJG^nS?%cv3R=k<$G^b2z<+olTBTv$*U= zK4SFL3cAX*uGe8!d`{1;vXk!Yr^r?No$b3|X$H%V z7QGL%|5@65rQ9U5kUG&ycg@pj=RHb-6@GXw8C70#_iUiwpNL<2r&~08sWq&oNB(%q zM7=2&NtEa$>U(FG9;xf~+pl15X+h7md9m^P*_F-f9V1ms*m|1wqV>oHM!Y}imYpr< z^N;mXH$2rMlbmi`km_}kjar()kD_s<=hFh;`fIqe)t@iU3!Zk2o%NSKGh$m$7L=ZG z-gBCt-pi(Ow~bO|DC!a3QI*;BnIp!(VKGxPX1ZIwVEbK;-geijW1&^O@BDG}qtniR zRJMB;N!%_k+;lr)2UU%cs#6VH!MIOevYkc_Wa`tlzdIt=(kL0ZvWLU0Sl8=@T<+Hj z;-#9-{VAGB2Y>Wzs*Lp{&UD+sU%^|*{MhnqJKGM@%`Y~s(vVoiEZ&gI!E1u4mYCxe zHDy0?;QQmanSU*b_{gPeR~!wdAI8q#9lUHgRnO|By(@Pb3+(6c>BRSK-1*XxIvZbl zlC-TKv~vFJB+t%N&Xls>!aj*@n6`JUGhKe#w)W>Z4|+Iyjip`fTBb7d|MjR!V}JKM zG8^{A_1XRaRi?4Z(}>+(m7S=YRI7K3XmOTT`2J|O?{`;t8Y^0wkdLG*+9 zov*_tUMZbvv`5KHMw5m3^)f23Ga??dkEp#Ws-nFECVxo` z@5v1btY4S)v>!=UAH3#Dmp|M7eaU`(^JZQ){Xjfx_6w0;(AQzF9=DeqthA>2tIfC> z#UU4^M{?}U!O8br{&bWMc%Ib5Z|x9jV6s_q`@>fy?`{qqb=_qk8Jg|AZNHWZXE|+3 z$wu!)dN1vB@2ILa(bo6#Uxm-;&-TlAZaVa_R5ZEtLsCRQAyZ$No?Kmji=CN2d9MG7 zO6FF3ICb&$JJj%Cv4ly8H5r%j%=_u13#_eu*z#FDhcn3oi&``lcCsKZMU6e@^H_PY z?YSY_7PIuC@E)^LvPve84@_Ct!;a#=hI)i|)ce@tj)7Uv*@)sB$#3p9vckb5@|-SN zQ{NQP&Z4ykNSN25m9ttmQ`-Zhoi00XV>a(L&Fj4}jHGq$l<|KXz)}xySY8nx%90~8 zcWtFET-Tg}uV&vP*sphaE7rbWN?L~xjlVfy3z@6Zv#{{+UizT$kmixogt>kkU^YH~ z9sN0I{=M>AXCjxRe?RSNC~>`S(bQNuh3O8+@ciKDPoF%Cj||ogV4?Sn7v;r-a$ihT z+sydfF+-kJdG2+-neWyMN4Udi*{&vryA*BG?Q(f7Yf6Lfs1QtPdQ+S(e_xe#tcVCh|b?!IG0 zMY(u6a|qrVS+jZ~m5L62INe>1>IL4}5P4ONlbL+zvPbXv&_m`>n9<*Mm5tYJlu zt~%K}w9hN`@jA^moI&=Gpg?n;Uq4VvYySB)O#jo~er_XT4zQ55|V{%fd5ct_q9Akv!MT`*knF050N0s*%jQST?of{_@%Ok*t6-`8eApjJrRgAt_*N z8Z%4#Q+@o!TyApKU(2Y2=A>W!uL1@29klQ8eSvmjqv@i|w*I}mR?z|3@+NoAhLG2L zLu%h>ttaLt&D;8l?_kHDA6Xb{8BAr;{oK8O@!Y1a~(x#H+_J>zzSC+(X?d@Ut!)%c&FUmV{?-}>V~Rf5i2XSADh z9-Up>V9jz`eO$Mvmz5fOqiO;4P=ezHJjNx{p^f5v4zo{4|~q%Di6<_cw&7a3%BvfeYwSe 
zTCqO~_0JZt>EQ)$o-MQF!sL7TB(AU{L#jiM7B-u5hT*46p6-mJLjtrE-x!#)3+tsW z_|CInn+h8iiv}&H-6($j$7U#N zldnJIYwk(svit>6H*dKdVhbb;qRNNovxJxXZm7tWG0ERm+G?rEEUD{DOZ2P-WMk}^ zx8u*GQuNvAkMLhZJ;FPh+cGV)QS^qz?i^6x&);&?U**iN=26a@S1X9!M$yL)>t<2YCBZ`_=Ejig*~elN_5_ln z;^~|345h5XFWoc0C6X@jbPMzdT}lt_99=wQxjorFYJKOilQtyW+Ai}?rUTulZmqU? z#3uUW=E1ZVch<4kr!I%=*01E+*UnXxEwCW2PsWrC@zo_AgDOu|&}D2=PF0T|Z^G%H zCGN`fs})^ys^-B|_hIyW-e;lD7X6gJI+O2@N{37Awc`J4A~NLuc)lLd?@E(>_;*B~ zj`ECqCNgBtPruu@_(WyNk-H7+Zkv{u9*fa**!wq{UF%HiV>w@qSnU=4?7c#TIbSf$ z+chnnIfZUaSaov*$@BUwom4lBSxw6>cY4}b68+M7YPM9FUbEu;OZVH8O6M!wlTcGN zrK7$Lx?VrZ(RQz-?6A4XG$HMa(zg3ztki4R zim{2u^hOn&lwMi%O>c>Z<|vzj8zsA%GFGmB+>0GAS@B4xPdJ;e7`rJ;B~Ntun&sc^ z`|gR1n*$_%R$O2qgF6S&HO1^zlH@xTtz>q}O}Fdvw(IowpG7lt{ePAi*96}mb?X8v zCP_PoCiS4kdzD{a2=$?HgG_pzAHIVY&Zv8szDt`{$SkrxEK{bJ8GgqjN6PHqyoNp- z{Sp3as7H85588}tk7d7M28a3`$+7e%$JCpibj)`n-j8PA7&Eh;of~Glb9&=yVmG{F z?X1)}Bz5Vf8{2=EGmSn{$;`!(m|I0uDaZuSZN@UDnlHjBY1nN&(`pxaQ}8HXcUv&w zJYKDCyS#^s|Ek!3wDW9gI(n$R@lbbKva&7IXWlB}y}99dCePL0R@C!_)0IH3O4T-c zNQFK9`J<`tye$&c;qV~+;o0G|O;PD-{P1ODWse($`%6}ldAq_*RD)M>lXAOazgW9* zQ}(@?)Zt>TSNvCcs&TD(Nkdi!5ij{$GWnLKZ~uG)a$rE9ZRhGmB)s#B(MPY+k_9VD zwy*X#WIfNhHBKz5({nAC$gtTmoFrJUncmkvhTS|IJZp%o1NUym6G@HtnnY%$Ol9rj zbY>N^vCbhakjs;*@QI2_DSh-^#dndLDIKF`pD<|KWj&=+@f+d_9_Y=s9-z5?&s{xh zMY*9>{;x$7rGCU41t-(j_CK&ThgFJ?$fn2?OSM~+6UTF2dN`MSN``y8v;o!;7GxEA@Ly0VAHu)QU( zzqTe{dm~HaJC>*ikFp>;q%`?*QLASka^#_1@;El@pnI0qxh6f#{g~I#XQMyDe+~5r z@94`fYlHPJui1zH>d~QhwF7T{Tthhd57T$()Uk>G)uZ;rI9FXC2huf6F75HUau&HE zGFYx~9r1g6P}AViF~S*IN#*qpq#o8vh0hLbq%U#}w*2mEOM@H}AH2z5L2}1^?s%&b zL}r?Q{!!`U&Wgs>K6~pij83P=I&#Na(>Ez$=@P%DafL=Y`Kb!7WaX`f%$=HBNKwe+ zCZFNqH0bjX>(?@KnP$+DWY^l|^s?tjzu8su=oZ#J_m?_6@1ED#B9-nwn;oW4g#L)> zK9UgrYpigRw7XD`#<%yrr0>{SB6rMP;gx(Sef`M#o~mZ2D6nbQ(ExLOQYR;KWA=!1 zR7^26@Zj}+^xFaJcW$jl#LIfUDvZ+>fZ`Byz9WsMuT-s6d5@59bBKcvtA$NIDd5}l&KVGr$Yl=WkF z4YQ6FyPgv5IXFLUNI^24eB*Q2)VXQY`9xsg>zqhB?HoIQ@5%*sQcbI$jqGHiZ1LOq z-26;do)|Le+Z7pRVY#d8WnCDv%6YP`2jBNCeNy}=XxtKJy1eQA(f1ci`t0~p)Z>{Q 
z#r_C$Kjt;`+31h(Uqd~@JGyvGcFYTvCiZIU4uy*}fPAu+-9Gf_20A(USWDBlJ8WH& z{p{T3WlDUn1 z638wOA~`YU)7vDzpZVf*K(W-9X{Se8EL|K<6V(ddj?4_CU+dT>(~+B4R&J88KN4L^ z(h=tV(pbZ-fofW9gjcJU! zb^NP7OSmt+z4x3#I-<^2!f@dvy0gmEwZHX9F4{}kX771p;;FIbZt!*u?w3(3=UzXA zs_gqUf2vaodl#nU6c;jvr8$oNbm_qdkT`y-*g@N#R@?|VPr{$DgH^4{tHpK_ zmkq?i5o)q4u{BqYz2;WZF>7+gihTX48ynO`$ zbkU1k>>XTGcV+|EuKX}jsoIxW*qv35I&aNVZ4`2<=2&yn@6>#-y1tN{t1Dc;eAY@* zb?-uF3N>Sz)ti5}_^qTn-k%JemJmoi?>U|-`LLC9Q{A6gD>I!u9b~?<+GQG-y>`9p zyRCuTgBL&Zw@m#~Vjy>0d++xiSOie=={PNqRVq;aGfvCrq1R$76Wg4%pX`ho_u|I zYC&dx4?4VGnbhk|h1B)a@h>JkU!YD`KK8P_9`}9C-f735Xp%jlZe}_DW9YYXebWmr zouYKZx@A|AQ`kn{$lkAP?F>%QvAoa@o@-8>Meh&L}?aG_QWW4SE&VDlf-15S4NskYtGR69y3rCIL z%x0Uga>bDWbnDKr2K~_v+{q)~@5D7bkoIx&KiLL{l7;J!EqvQFom$R#`{DAk#q^N# z+wBr7`Fi6cmv&Fej+!3q-e+ZRqEs_$u#LsJN=k`qMV<+jeGJp(Mct?@oHXQb+pS>dlPcKT;BT~GH0S)Hj=@{8%xOmPFc`uWl1 z2TRO3mt!8!0{t9F+TX)v2CY}vSnb@8OD6PWr+V#+SZk_F9VZ+(ec+=qSKt`7`-RAZ zoJy;@rWfqaU0s+E=Q%H%PHxR8Z*SCNkG|gts~u{}GB((Sd>JjGw^RXd#0cFk=M?YmGvEhVmmrS#Nr5^-+i+OEew_IekYv~y1FnWW8Z{qDmv_E%@J zxM>n4DhYWkYEAzi?p_DjFda)VyNo_uz1gtGGC4;m?jqP{V-Jb_5$1l(Yv{AlAK|}- zdW3hRacP#>pwTs~#qE&Bb0tIaYUE0#$NPe)xJz=n?#D%>>+Wb{m-uC*;9N}L*NJA- zOH=vY{@ET>%A#xTt7EpTV^iizvnfGbd2h?ymZb}c=02mIS0%%#jYsLWy)W0$M;*}~ zTdemokCf%Mw->vSr-4Q9VmchioTBhEGaY`H{Mn~NH)XrA%@d9r*MIe-_0dsVq_(Z2 zC!?Z`MkV^NcfCS|yNJmA>0V)<&1y^rkK^}{G|M}ZAoNG?cK`ps_SifvRlb)7*I<0P`yG9)l>9T)m9d_i!}4PIZ&~iF?2Y_+ zLG+!0)Zn;5Gl^04vey;m^GMs5s9!Ht)Y;9q+f*7K$5DN;vaj@)4xPdtG<;5*z@=*I zsP!!B&2>n$kN%YE#ER3_|Cnr+PZu3k_Mp5C*I1}Z z!b}~d3+(Uq)H|ovrqDYzU5b4R+_@e~qA8imr`S?@FW134iKgzB7xykcL3I*(wq1Xh zMvwo|`}|Blluk}Ob}nY-7d_`6V=Y#V<-hm;?rXSjI7U?;6T(y zS}k%jT+i)pn|k8R1Rqw{>}+X~7Qlw3Nt-t}+mpW@Dc6)+7m)!vKjrTwZX|my_6)tE zxP+wz(V5y``8!XBDDGR?3d;Q57O(rglq-I#GU&>1b1v}rdr6hi?wrD;Lnb zRCAAOsk7CI?;X$8x3aZ~q;6IBe@%Xin0ul)C9)}(Bpq$c$mq2WAyWfNneS4g%%(73 z;^O0yeKmRv`+3jR*ktKmIw4SPQ0=p+r1+u!6GQ!@|Q|iatFT8Jz>>biTf>EdQW=o67KyXzim^hyGpapFZ$xXN|E+WjK5jB zGMcqmjW)df(va9x8OCXdX>%>NXj*%jD!1q5TsrZB5gBmT=kahpzh~4j*SGqM9Jz$& 
z8n;tjLphJdmD6d49=XxiCi>;UVy4;{74go`iA4Q2?^!UMkY2YhZ4ir`M*Jmm!n7~C zaKFVY%2t2SBCpP=RrG#(fK|M)&1ntRAh^@wzJ|L9_Sx7&Vt<6WAM+aeZ1hL?uc02{ z9ZlFp+E(_v&*Ucun-yDFlGkL;wSj&8$*flG&+m$sFqL(02UpLVM1Q}%Em=3;n8w6t zJ0D(e#GQ;i5)m5^#`U+~Dj6{@likTUwBY=ZK-Q-p@05!2q4&Jf%43$Pa{kuuQY9lC z$mrPmm7h!qP-TQ2|ZIrmMhh$1+c728YQQuF~ubx%&B4xh5-SgUY?{8T`pPl8U{$k?q z>7=UH|Nm=#?<*^XdNiTgVf^D|TEuVt`SQm0d|EbV_>k0#?%ZV6A7x=N_GFjnrHSO( zel)qf@z9gG%Cz=iZ1kA;74${p(nZ1AjnAENYa&&9b16WQw`z0j9$ZE5v4 zllFqIZY<`;%cpQffMT@My4bf9AVdED{-3ASIPc@2Fw`Xl_;P>=AAEW+Kdwg$B_)yqn&qASeV&Qq&w zwQT%3x#54lK5TN~ij++KYO@2#=R^t4qyGbTSZ?Ww*~Js-Bb%}{p_iSx)=Be)e}qbtgipZ_5Mo(}7c|MW4}LDt&ypeR54Z(=>yKq}W{F zZ!r#JO^5iSm{guuy7XZ9ner^%EuW@#zDiqKV*0ny^Wx72z178s7Jt+pN-~PV{q>$G zaCyxogQwpt)YEuqf9ut3>OKMyhIs3 zFDNM+*s)fc_T z&SxH;x)v@vF*fjKr}-*6+iKa~Dz~qqvE$bkZ|SARjU72JtxH^)1WWgmt6A4vBCh_X zdF|@llFX)8{rt|)Cyo!oBkQ)zASdb_O(t|15Yr>F869CxoYED`lI==`Z0+k+m*-E^ zBhe>1d=00%=p8;h=T+rFHDVj&ARhZipY-nN#T)igOD=fst21>OMrSBRG{}y-Ujp3- z?)|va;=YEv2=>|7Lt=k~xgYZy`fT(^_^+WJ;T@S>EA098bqfoc6j~#DB#?x5?3pGJ z>qYGE6!-kq>`LB_`2Rb)Te4||PW*kA>b7QW-(*MPkgN7E@|6==C++?G?vpj#vqy&} zzdCP8-)H@Ky6BG+9eZu9S^cFoR6h5ao%ZkmPO7C|{(n53hd90+*C$@dE;B<{BI`H?p_jkWrs&DFlV;R{?kDu-rH??RfIX2yNZr;o`@rMkR z#YUk+xl6|a%GO=0u{<|6Y45?_4Q2L)^9;R*rLmb}wd?T(5}oFT&y6pBeMm>&(F3crPX||NrMYAakNo+>rM{evzoO1%rx$;+x7aZ?=8sLYY!f`o8?o7 z==MynHb0VlO4fauQmknE+28@^pN}W{o=VO{c-L?r;T-ATO`14sYZLR-)oZM~ z;K7pRwj5RX9maNP9aSGX!j(Lk;6A@?>0BarZ0GOrRrlGf=wTOw+HKgk#I+=Fe;<}( z`F`FR;{Y;9wP|q3TMzEs<%{O8OVWw)1-0~FG9tRu;hwcp$P`Nd{QB-cSBKqf-NSE+d&lUTps!t$I58x;h~=SveuYXJm7?WKYxzAS zoi}ZllVpEHvrO06c3o#cycRU_^BPrdi{k&;bfM3(>MQ>ykniO59Lc-J>_7S^h|d*> zHd()^ypb(OdY=6=Xq;UX>s_nR=X7X^WyKPuvcjGhE$c>HyXpNvhU{*$)Ckp>%I!)n z@0s-EBpW~%ZvV4tFzX*9<7WT4)l#QmL;V;3o8pkF%8Pq@JQBB5Z9jZ?tqCdhakkuY zd=C}x=^*n9h1!}%7?C~57#G3kDi}QFh5Se?+)d5Fd62v*i@$@Ae`yMD#e7z&tbb}ry4CP zv1i>TejYzod`t92^HbEDKCb`y*x)~cZv=WjbXw?Z&_%G%#vT&;Bh3An*YMBA{|N6I z?jxL|(MNV}Gdj@3T1W2cS9NS8Yiidy7SK3{^C|vvEa!(0vH1Tvdf7waTj1*ZtZ=D* 
zCy}Q+tzQ>zTgF_;?5=HNZ*E>o?p21I4owOnKl{(qny}HCKAgOu+IdDK^;{dDDw)@r z*OVn1KO6n1tXh)bWBX@uZ9DHj+A?ED7VjI;HvW5;*ZZd^Q72xf70cY?^)+_yU0U6o z*6ZSy(y!ws`|Pr_VCk)}wQS!UL7rJhCrh&;JQ!5K2%5G0zJD@nTO*Gy? zX;4rE;YpA@tDWm%x&!*Dsn7b4DT>uj~)-{OvpXOHKf zQze<^yRH}xRc7WjO6IK(QkkRYB>&8?va$z%^^Un-zHg~xHscw+e1LB6X3(`?LLZR<<7VlkbX@9nfW6&vJTR$e2#MB>KOP`?=bS zA#_H}vS&fA~EQ1W9w0nG4r%yWgatJ?C-f8#DAax%#u-=SP(8ITT1Y$U4tm^kxlR zZL}!m*M(UuEq1@)hm>#h5qxaA>D7Y&NLl}^;2XKoE+^>y8^sk*Bs#67SzP!#$LdP4 z(O*FqDdR469wgakv&?MSRn=M=T;^kYet*PL^?docpSjN*(=f!sgsXb6f&XW-UmoYY zkKddx>UHzXt(H|+#p}=Z6Yit&2Fd>XIl85SCzq{gZin0<`qI!t3zK33?JN~8T{tvS@f0gMy?5!@ob4<>;olenBC3EU3*Sc{lYV)_v7q*MFsR(3P|&sPgSBz$b}AHg>Qy&pR5fBM@0brI~dv4_O|2y;K? zHT<*jKf=3)`v~V~@nloQ7l)s)%r52LW!3@oTtr5xSN38~?ZCK4f9>N*;gw_uBNGpD zT&+6K-+c}93)Oz`iLR!q*$(uiKo>aCHrhvJL2AiWs_{Vh)HaV=`)=MYthPXuL{PP6WV7mN`o{B}`Xb%2t_-f&YgpUpWBlt$3_d}R^l-|a2y(){ zd3#Z61gBIIQayHE5bco{BUjtx#(w*MzJ5?Yo}FL!)K$KpDfeQ9=P;Kxcdq&8dcS6^ zIPPD0+~U+KXU^xA+o8ZQZQ{ruz0|CXKZy4Clao(zH|Ab#QPm7qoyonKpS3>hy5yhw ztKy(*CE>2-2zozL+FCkgV}&+3{55USUk?X~Us%vZ*s)yK7dxUP`?IjYQ-6FnB(rUW z{Snh1{PFCOu?x73$8Nat^V+-{%ECX}-gs~L{=KH;&gRp}bhjP3l=f9rI>M0T7kRDb z&rzReGP0|llv8J)dpd5joymeZUiJoC%s5BgGp!!p7Nq@$ZCjPI0sWL+;vbUPkG^@E z^U2A!ki9ea{294#GNG3$79W>$TKFO1V}t()z7gpC&}pHs zK^MV38+%CXk1+RRUc)~d|0BF>xQ}p-)EhM7^9DR+EvY+S?ZrWiBh{P)=`E#ehDsbo8bxPtG=Z za}#&uJe)K_OfPlx`P#N%1xqWpEa^FYFDvcVBfUURC=gUcg%w% z%%9Hs=uytQ)+_N~^a$E($iGfL@`dcti(HojhsLry&Q$YhYajY{r03$8GDgSVh+nwJ z^#s*_nJxY$yNSvSdiuj?Q8Bx`(yZF+oEg{MIJs$h|8h3jLaYArn0=CBhsl-tKPiot83|VCnCFgT)W6(+N+QR%^Zf@5%MDN`NN+LUoHHQ@Ug*v1m6hs ze(1E&*Px4FpN%~v_D7ieF|XmDjsFqeHQYxyM@FI*<+&5;n77)k!h{VAIom;TzXK+D z607=+`HzjQN$c&-mOmPovL~HSTo}G1j<&arS^M^CBJEPTOnqRqsd5qGN$7&V6DiktP14ckg~=Iev_Rv74)8&dZGtxP3H(YU){i zf4*}7)taqY`+H9%Yl@UO$A%WOGQGdbqdYQL_Tsq{iu88VQ}MPt^5a(# zQ>)9^IitmmIdsTg>7KKcd7QdXoAYlztN50iGC;+er4$uE*%&I%b-vZrKdUy1BEN>* z5%MDc`TYOy&xWrSen|M(;6H+I1bRPoTIg%gMX=At9uoT_%>9_x@XyBo2=5y1Bb=ih zmgBN^E_%hP?{_}n(`!Cm^7Zc}<63ii)ON>wkvM?Vj;!msFe-rCZ`U&I*b9HcrB8M9 
z)(zx_J-j?J@qQoi=luASx*xqnu2Uz@IKw>>|2lUtMd|r45;IO)R@vN&yj;-r?C#PTQ-H_`yPM%d(_v94)}bOwHW?b{O|7O zSjj(|UC`{t|Bu+osCbLnnP%M3UpDHcCH7p!>fe65Q@@M!m0uLS%JgF6O$=G2#%0Uz z^83QFL;6r1f4kmp1sBB@afgPvU6CgTmK4bRncXhhIo)ITP`7Mx{yyW=1INe{+2#52 zOIOY$_NwcQ#%4HkcaPO=j1FE%3hf(wE+vau`TNmPmrC7;-eAoZ-GQgYU(^(}H#Uso zDvOmh>>b=Wvr)lGhjSi?1GHLW?sYL>T06s?-)F!1F9(VI8gfU-i@@g(e>Qxz@I%7K z2LBO!BhdSy(?VZ^E`ohF_K?^gVeZGghJQBxM|jt8AK@G=*7GXsC-;h3{m3nH(0XCX zCF#v<*)yM0{%KWWUF=U5pF6X;leLH|8})PPV3|2wC*PbGmxkMlH_Z99-Kg=p_*ufQ z0h1atMa#S1vzO5|CpTqhJ@d7kO1jy!PRm#C%LTrdQ{#QL`z;rAIRAbtGCq{j?Zn5v zT#e%_SKfbQuvppq%UKt$Q}W53=XO+xHKv^sblOFxWyvO8{5a9w7+x1)Q7L;0B>Qa3 zQwOeWwo&~nCW{h}96YN@Y|lD8Kltu}s6PE8Kd&+0$qw@*|7&wRH?K_I zYu}5zVBffL(AA&fuDPc7_;a*-^o=S}RJ(aS({1`ulCKKimh*m+FZ6x_Zm8cdz;|7`q^@UG!L!a1tdw~i|N^OBL4kWJ}p;>hMT-52-U5>0fTPc@qtwT!bd_J6p} zeJ<(AyxcG7?GSUJ`PO;jG|{o(?hA9|#&af#125!A`si)`UHI4ZW+Dk49`d)p_%wE3r?D2b*)@+D?z%50552m&^(@PcY|FJhIj*FLL4(*WQ~p=zs$$ z`*25l(^O-wl??7a$8IwXN*G6bDf^KXn+DC2aeXRoyfd{=O5{&*SgQSFt;rL(nv{|! zEVT>g;$u_W;N;GVx(u0Z{91`1myJ9va*)WcA$Nql2z>tVXTw(uKO}r?@E^f90=*wP zE%Y_$BG_kR4~hK|=6=j;_-EsPgm(@15zdj`aNXE4W(rR?VeP zZk3&Rr#_eMXq~|4vf1}cLq1Q-ypE0KbC7K32_e76hMNhwBeto$$4oviGVh@FbK@g= z+zYksfl-{D`wwJ296_vf%vdw=e0P8~n5 zu^6Wd>m>hd<`eiqUGhIFn=$D0kVO@8#B=K2PohtQiCd3N3L8_bIDNgZGFv^1==c>b z>)Le`S;h5}znk+k*w~N&m7POcEf*Q2m+x^t#tvQcF@B)Cm0d1~`E|E)Epywtr&#t+ z9*wl^mD{{?E;oMFC%q-Tx6>)1I^)wnnX;)B2TyG89>es_b4nVAM>CFD?q7d2jFGd6 z`RqrtxWyo5SyGop6#0JSvXQ4n4ifn_dz;|7`q^@UG!L!Z~_qo%!Tg-&gEJQi|?aeK%@y*gi*Pl&Z*r>v# zcg>$TV4)s~XnrLt(brf=dp}+mp-+2k&Xw%5%eww+ZlCsfJa=OF=<#Q-DsV^IQtnCS ze%jsTFF&tQ1CPa~l7BXRy(RHTNxLkU^>M_JnhIMoc}Q$C7uzmQN*I{VpQ9&x7ft)S z;so08M;OT9& zThH#NJIQ!(wl6%bw{|O~EAMAjebKU}*?uzWFSs~*%D3O=l zHy(K}IzHpAl6jwK22KR>{m5k_Pm3HR@@vQ)Auj@-Km6J7)xr-69~=Bf@Qpz4hfWK9 z4Y~;S+1Nv3e}uUo^BVry_#fe2!+nHvv^?T@MUP>xSRd^ZM|>ZJQN7?dJSW1^O;wN2 z_gfBgn#||2Y0|B9K2J+SuQeA-a**`E9>8ACl=8^xckzO-*|e zXWgy}u9Z`!_are}zWj_RXUewQsXmf4^#6Gy z;6xzbk6bqLw8%jszlPiq@*?p0!=DXbE&P!1vB7@?-w5=6=(Nz+po?IijXfmxN0|FD 
zui>AK{}J9b+($S^A5FCOEr@)@_B!eCyb+5c_l(~XPK4$0wx^LfpUTR@b!vM`a@lN3 zbv2)-W!Fc}5^|8ow#*R^&3h&)j19}?b4QosRNwP?k$CN5v;Mg{-1^x%fe|_@8w>0o6GGB_c>bLYd>qea!>VmpUzb6v20$ksVVg_ zXk701eG=OF*_JZ3SucA(}JCb=$k!9%er*}U~@=RX+pH(Gpu_MVJ(gaRX z_i3VXo>O#CZKl8}3J*6uX?tS;Io_<-f0*+S?#}i{0;lMz{@}Abr>L~JN#GQv#*Fp- z_w}=A{rzJhENrT-j={~ipqN3cI7!mmJY|H z{2enjDStRnz5BgMdNaM|aidgBaeLSa#B~7s1 ze7#( z`_}TDB04)ij^`Aym&^O{oFeve?L?kaL}eaG`8ykSN%=dIo=W*U7jFps9qMOyVC+S$ zFtK-EEuO!_4)mAueV)Zi`924Yqas0-rzp+3?lE4+$R|{73MOK<|f63w;f`2=>|7Lt=k~xgYZy z{@M5+;a$UhgmcumSjs6n9Vq1#-8(1c6t#Noyd&WhF}o4dSBKfJA%o6Z^PD0UTCVe0 z!YQI#t4yyxm*i9L^b$BlEXX*V=M>SVxLZBWJqaM=MnB*=MXcy)ef8-HLEOvCL8BKp zCz7Qx1p=q&+qv-qr)W@bk-#a6Xzp@Pu+LtkWMkgktqwDqW4Oc0;g!db%qho zDe5}w;^QtpQRHH})t~M)t4P{qizNZOl1T4~Wjv>dj=NyNbBb8sTT6IO5&If5kLMK8 zR|oqE{2dw~B2G-{*GU{Q}?TWTlkv zn=lif#YX|koTQZ;6H3@tl3LYCcTHqgnTLaz*I1$MABbSXlEpm{^uOWAYya;^$ z@MptU3qK@$Z15k!Hv+vMIxX}y=pxu>V-Jb_5$1l(Yxrm5e}s1p_YuyK^i7z6ugt?6wv{35+3B`$>d^i&r>apJ|r-=`3%n~O0Ufv$8(Ab&83{8N!u@} z|J7ZIPc@6(;{EzUi;XcAS(to#G;1n6^<>p8@MYPcN@2AW+L1b^JCeJBi zdUCNmr-+uXeV8w6mO$3`Ue9xiXiHr#&ncq*W$H)Pn7R@jotr$Th-%lQr9J!M#P$EO zaqX#+wdCr=T7gq^bnER{o>TO1%{PHlG;hVetR&G~PJLWW>D-U2NYQ{cfm7t479wzp zUXGY1aEe?tZdIlkO(WmR%4D13R+8O6wkF?P89|1v9m{iy=+Z7KJg11wNcg~WikNb~ z9M37D!QLqXe`nw@JAuD*bzg?S-|2VyCePoYjcpAL=}vOot>344{ti1 z-^ar>N8tO|mI-_xRPH@^q-IGI;{dA<(?9}-+Y@Yul70{;ly8t_KIi9o&|xoqTV zk%L5j4Y?!aMd0&?KO4SU_#xqAgZ~J=5$OHUX`!z{7r{Oodr0h$F!y6#!#^AUBfM+4 zk8qBOxOvgC*R8jxz$MXzxrZr|48B8*u!z zB^SAIgup3UVREHk{nbsx-h6`Izc0xoN&jDrgi|E>zh?2ABGw|)5+&gjv6@+l_qEff za)TOobyhtsc}HnD&naT*O0#%Q5lh?hg69;`=`74K>FXR$_27WF!H0sm>5l_sZEIY( zv1UyIrzro^0fAGrSf@bX6g8R|)$*L8U+K9#e}{e8Jc8%%Fzu(dJb#CUsMZMloss+M zbCmV3i@%Mj68JkM>zDIAK{}J9b+($S^U$#p*MV&(CmX|8plCx_@yh*H>%?-aG zRlMg)8UBW41#*T74r-)s7*>)$_S)Z))PQCrLc^=UiHRDoH zr4Pvu@)I~kRu?UL@|+@{-K~`86usQ&D0|?%Ke=vjKXpS_7w*p^L+$7~5jUcBcwE+^ z5aQL6)A;W2X7THox=G5{G`LRYqsMl>DADPjIgM*jelFT|Z@a+XDSK|T_=I+cxFu6& z)G197vFZ16b@4(KQaNK_&{fAsvT4$S+7-?VxL#Ag34EW^YIzIKa#J|)YQY%=9}-+Y 
z@Yul70{;ly8t_KIi9o&|xoqTVk%L5j4Y?!aMd0&?KO4SU_#xqAgZ~J=5$OHUX`!z{ z7r{Oodr0h$F!y6#!#^AUBfM+4k8qAgckvN8MdCK;eU!RJ!6<)UBssGElfWs`Op@M5 zjlpt>Ls}#pHN$}dr^qLJRQJ$78%UAeA>)ySNo38$xjd(cJzXw7BjFUW3r?;)r-Um^!oHYFP>9$@I#EiDH?5UDR7DgPLJjJJM{bxfxkobuRQ1ZJ9I|* zIf1`3u4Sd>jITAK#O4-(zw_7L1-?)3z?B(YR!-;U zC;b)pKHwgMR}0Q4_>kcGfyV}p7WhZt)_^wxP6YD($Ymo>iyS2KYseiTF9M%G{Mqo; z!Vd`_8~jJ`jX>{*P78evx(N2!*h6A}gt;H{8vfb%AK_iYeS~uqyjRL8difxJm|bNQ zmy_{nEzc>+`7Y%YDU@s(AR6ly^MHG2CXuX^6;3XvDS_{u9VzO>A zJg107`S(63;S|xF_ad8&SkLw+3Z zslJq}o8xzX<=VC6>j~GM?xA(75m# zf$w8+`qAH4yGE0K?%@L82mEYskHM=2XB2!$aQ(nz14j$|BXDcL8v!Q*`F`ZGk*7rt z68Sacj*u6D&maD5_-f&YgpUpWBlt$3_d}D{zW#Hw7KJK6)`Zs#U+b++!{`;nGZjQ}p5O{f8>I7n3)0%6CdQMNBzjH_s_z za?AJJnDpI?c-_5QpW?TMR5$M3QFK3%q(3~ybBgGB88@C&M7h*07rl*_bA`j-C>5jfJONP$~;s*Awi(YgF#zSH5) z;O{|MX~@J7IiK)xTjY~*Q?gG7D}xg+F7;PZz+8@^ijA>m_# z{|LSj=>5=Xp|3#~!9E*%NbHX=_hVkeKO6reylc3RaE>ltwG%i+)Bk@T>8dVS)Bi${ zqz3Y%z$w}vU%bg(W(E0sCrW;O`g(GeJ#Lh6is)|5Jf(E!=M=G$Gc7I6!zU1Tr_hZh7glrK&4=!Nd2TbQ zK2c-ObBYQJ|9#*&MR_Ny1x``-?AARzr^x%+37)@0Us?6szN`*V?FrTm@kF9p62i*Z}S^L<#-zfzv>!+OVC3w)oNrK&!fHeRHnL(2C7&mSCi z@Uy`^2Co*JQSc$b^#hL$94+vVz^wsq1e^%u`;p5=o)$Ss|7Lt=k~xgYZy{@M5+;a$Uhgmbh**+SqHHU1sha;?Ch zsP>B!I7JTidoy(Y`H&H^C4>9t1rYB!l{}}2U3DMLbBfr%MvsXSP7#e*=e}Dfz@M{i zO{0(B8gl1H4B$CM?6UrxL@A@WrfdiK#yHdBr(2%=6 zMMvNinOC?8oT4-(Z-G-3xi?PW6y@x%;rTo4?|dzuzr*@Z?!ohS*w$XA0)I!X?~u~` zk;a_cu#p0PC&EeK`_P#a1-=grTqp2-sCY+?!1p=2dCjmzB#>+=A0hC4&cKRE2* zXM=kTUM)DI;6sAz2Ob+ZTHqgnTLaz*I1$MABbSXlEpm{^uOWAYya;^$@MptU3qK@$ zZ15k!Hv+vMIxX}y=pxu>V-Jb_5$1l(Yxrm5e}s1p_Yuxf$C1?nr)WXFBVB&pg*fw4bnH1rhs zK2r=LM*YrqCg>MIF9dl0;IMiXEPw>(kLEc=wCzfrf`n7V>@yp9P7w>Ueo!#( zfdkjBR&CVv(PYm5dJNAgqH(@yJg11h>UM_b6tQ8IsRdEKlet{=&hwrQj^G*}HMZK- z#1a?Z;c+~t==W;xZ9J#QU}~wrDf*RGA#jSU%&znN9eU07D9_)aI^ryzze9}%OZhu` z(>{bBy{XI1-1%7G@0hMh;Q2mG%kUY`_hHG~cGpVyKJ0G#34!lZ9zF7xp{F5fvFIo8 zeb7^bei8ISfaebmJNVh)9)nj4&M5eh;QE2b296f^N8r|gHv&!s^8LtVBTtJQB=T#> z9U(6QpFjNB@YTW(2_GB$NAQh6?}ttceGR$@_Sx7&Vt<6WAM+ai+4vvfUBi8Zb975l 
z$|)Lix39n{QrDR}J~(<6vEQ|Exz2Pua;{+6s?th-lIt(!6b+>L0;fn;BX(Fsh9{}Z zkNOpRbp|QXzP7)+LVxbrtnV2TP7%G<{U*;TVt;lSxJo!htZM3WvlpXRb01U}lsG9b zCJ!?@g(ycjb0f!%3YWj@%EcY+_2tT#ZQ?hV6GQigDRH-Z?woY7bsTqRoW;vJi$>9m zyIXkv4twhpoY@xmPWcv4_O|2y;K?HT<*j zKf=3)`v~WV&|-m8RD0-^$(Ii+$@|080uSD^;D)dL{=Q<58#lm8dXB8)Vg*jolh8}P zo--Gdjx*6RPh4EMd3qODG_IJ=HEyrqIYn&!Vt1ZX#QIEg<2gm__KO&|e+Ruu=qQbV z17{YHkWDonI%bQx2J4D`$v5M;S(UBJ;(T&MRXwLDUD~8hB&>zOuFX?P?~(uNZ65y; zdj$#n9s1+_m}5R(KSawM7ASD4BRGxWKlJXu4<$D(2P|#8=}5R&@$rjJ`jgu=wLIU4 zd9AkZA~$gkL2nlNj?hzsei8ISfaebmJNVh)9)nj4&M5eh;QE2b296f^N8r|gHv&!s z^8LtVBTtJQB=T#>9U(6QpFjNB@YTW(2_GB$NAQh6?}ttceGR$@_Sx7&Vt<6WAM+ai z+4vvfUBi8Zb9ChDCV^A*a6m7CQ{-r~N8l62$upDe^b% zCvb{7zfiQ~IYmo+%C?D4MQ~T%i~N^PoXK^_8qA&gb~U;5TV=C^Q^Yun^_CJ&5i>dY zi02eh@_7f(DPl{4=L`IutWqNIcTBJA3jCcqb%DP_w+$H;apcoo@xxmmdHxP7Z4?W9 zpX}=2Mm*o=@{V$W@AFkOk>~r+!WUh$^&2%gvz0G+z7IqH8hW$PcZ8lA^oyVu0z7|k z*ul>R_ZYlda7Mw01lJEdHgL4SKLWP~yb*9Bkncw>8+lsfAdz1~?g)7i`269|{?AwY ze?KIAZ15k!Hv+vMIxX}y=pxu>V-Jb_5$1l(YxrmX_dojoyN3G+=g2-ZU*Htwr{?zN zIYk4yEM4f4VNZHYetRoCa4t9X|L3UtS1G4xY07kgQ$zwoz7MsRe0 zyKIu1isJUGe!d6r>{i^&-bApwcZuCm3tF4 z|1_TOL($`h{x$Swq3;MiHRu;XF9dl0;IMQkfqw*U4R|Bq zL?GXfTsHEw$U!2%hTIYIBJlbD=gN3S3LJS(5o@TM!gGq4!n)#A38#oY(flaz zcRs8$5%@bH%TEaWoo{`Tc>WG8-EP|2sXk6Dvv4=h-(iQgl?i+wy@-PX-=|mHV}b8; zciA+a@590ZtJQP{xDs>20-oIPc@6(;{EzUi;XcASI($;fDax2$C~%5Qdkig~HaLttXnD9* zGc}lmHfpKfsJG&Ru0G;9MRa6_5zi@NeO9z)OE^XBz-(?&ugs;~jmm*O+$n1k6w_{8 zdCHv>nAoZeeqlp2mS%(nt9%pvji0|+MR5%0X)hZF2v78AEPE%*cu9QFAzzAnmZA_#X&sZhl-{jIacL7 zgF~+<`XJHchyFG6W})v0JvHbTK`#V&{@}2KpAGIYc(ve+f)5FAK{}J9b+($S^zSEZooFd)P_oCZ-ts!wCm31$7FXMLX)>_kLx`_08mCAF9nEH#5 zt`bfWwfgyy=M>Rkzp!Zkt4qkWH|#~~x5*?SB`bO8>IBaI+WjoefMwjd)j_Y%H}~iK z3$OM_JUM|}xR!4GT9QwdKNN0pZ1;Gw>iHd>ze8)>&4${)kRfyP#^@UMbs>p9uBRQp z`I9?O1|BwC6iFTp-r8x!`Y2MZGmz)|urF7q73SB>Bj~3^uPFK;(c_2yHS}ho?+86L z=odjR1bF`7u!Els?lE|^;EaL~39cV_Y~W~te*|s~cq8CMAm5K%HuAK{K_b6~+!693 z@cF}^4PPz%knpj=e+1tM^nU2H(AS`gV4saWB=$#``!TQKpN;PQDwRNJg0~?>nI8Qo!WQ{fxqMYwwu7;QA^y$^LOaW>cBUxQ4>hjg)TgQhc(Wf 
zA@F^?iaZ3qPi&_Yf$!t;as0-rzp+3?lE4+$R|{73MOK<|f63w;f`2=>|7Lt=k~xgYZy{@M5+;a$Uhgmctw z=@Nlcv`77QFP>90t@FkUAsNBsbawJ>lWkF?N_C=Qyv-!;;Ol-or-(f}+MVYV(ZVn* zo>N3+7sEo*#`(ByBom;qx=lf9fWuxa9{j}&6MIR)3{LsIK z-YoPTp{EA@BItzx&mSCi@Uy`^2Co*JQSc$b^#hL$94+vVz^wsq1e^%u`;p5=o)$Ss zLgW{bd*LC6hT0>l0-vorrPO;=y<4Dw0WCRC!JjJv=R&=M=FmTa9>55!+(7 zkLMK8I@Mf(ztga>i@@K}JzOmCca~dk;Q2fBgjT|jtm`+$yOO%|{2k`JYnZ_Id9tEO z;QM?i|03{xHm3RUd>^)HU6-CUR};9&8oPPE4@K`f`m)h;jDA}5ilPq^J$~q4LvI%P zj?hzsei8ISfaebmJNVh)9)nj4&M5eh;QE2b296f^N8r|gHv&!s^8LtVBTtJQB=T#> z9U(6QpFjNB@YTW(2_GB$NAQh6?}ttceGR$@_Sx7&Vt<6WAM+ai+4vvfUBi8Zb430a z2%Ms6(QW&APSFRCtpcZLz~fnxQtN2$+nl&7CyQrrCP^LgBGbiW^M-^^wS}`uru>Xy z4TpD#duHUcSwB|hrUhyYdUeE&ix?7e`JK~4(aCi!0)OXW?4j_nJ6puMg=x;8{M9U(6QpFjNB@YTW(2_GB$NAQh6?}ttceGR$@_Sx7&Vt<6W zAM+ai+4vvfUBi8Zb2R^!lvC6@&)^8pDTALQ494T=el+Uf3N34T4 zBn}ltkoq!{_gj)Ki)5=mIaIzBk^F%Zx4xWiLuT0A-+wEqLp<{CJb}OCFas0-rzp+3?lE4+$R|{73MO zK<|f63w;f`2=>|7Lt=k~xgYZy{@M5+;a$UhgmZLsppC#OY78DWD0WB$ag2Qa<@4A` z?ns~6I`P@@+$J*g-`)Bcu4c-aigfux(T7)YX7luiamMmbT~;(zi#`6B867zJORTZe zO5pFD$rzU0Z(@zeFXiIHXG3O?ZV}qOs_K^zHJPuA8kabb%^~tZXI!GmnachG-^X`w z7h>k^Olo_Y@Z1`Dv}rugt)ZUd^m%R#MejQLve9#lep>X3q7M>1e&}CAZx;HF&{Knc z5%fZU=MN4$_}SndgI5dADEN@z`hmv=ju!Yw;MRaQ0!{?-{m5k_Pm3HR@@vQ)Auj@- zKm6J7)xr-69~=Bf@Qpz4hfWK94Y~;S+1Nv3e}uUo^BVry_#fe2!+nHv)aj>`Q)Cl2 zPT&+x8FEeF6rGGXFK~*6tBm6LJG9W`0MFl{M=uNf9eONRB=C2lWSi^OTyGcmpBpLg zcg8iP@_Zla5$DbGeQ1pLa-Q!)2QJAM_&&vM=S~lJ>O>4azZUpD-8Y>1EaBFa?fA2x zpM+auS-rPaZd27~OLF&By@XrC(7TSlZ1fzXpBBBM=z~O$ANtqOn}xn3^wgkV1icX8 z`Gdm_em1zr;MIaN3O*#be&DfzqXqsExHaI7fD?gyKXTc~(;^3n{2Fpc$cw<|4}Ug% zweUm2#|Hlqd?V2Nq0>TNgD!%7HujL%A7SptyoP@^{zrJ%a3A3u%{0jpI7Jt3*a@7X zpNfV8rzpB<8_(Zi?{~DRN%%W#fZCT934cezt8EbYI|2DK^%R%1ik*xb1^$lqz|K71 zhl$=9@_Zk*HaUgo`!Gd?Cj#GRQM>HY2Q`%2UOPkJ`@H=AlIPZxbrqZP+?ujgAMAN< z4IAF|0?(~sB@sPJB;1<+de_mHjhe{k5r z&j$AxyjpNZ!G{Fb4?H$-w7@?Cw+6fsa3YZJM=l$ATI3*+UqkK)c@g;h;m?Mz7Jf+h z*x)~cZv=WjbXw?Z&_%G%#vT&;Bh3An*YMBA{|N6I?jxL|QD3E;qFISY1Wu9Hx)z?l 
zL%SDb^86iImo{pVgulZ|<81`~&brC^*NbL97WYU@75F>;eFVM_8>BAqeb@`@13ce{ zS#DV(@O?UV#mBx~J&T)Y{9fSuEE+J6=hj@<@u*9Ogj-XVq5eBY!mY6^tWr{taBFDH z*(W@=hF+@l6u33$T}NLwdXCXgi(XOmL88YG{cGsWLf_GUJvIOD7eOxsc>dt9gP#rV zF?hA$jDimdt{-@8;Anw=1a1v@Bj7|J-;Z22^0df7BEN>*5%MDN`NN+LUoHHQ@Ug*v z1m6hse(1E&*Px4FpN%~v_D7ieF|XmDjsFqeHQYxyM}{&|PEqT>DxSZ?{MI$^knnd{ zRl-l6ze5dWue|2@J0temPA{DQP;{-MQsD2*t-Qwbedv#Z;XL1mzBnx6`95^&SSjB} z|6h}RrNkFwZRRcTeM(cLJeERPDMv%E?4!UxFpHPw*1x+C5%R{9UqD8hTbgn9igWN{UYdv0M8#BcJQ;oJqE89oKf&0!Sw@= z4IC}-kHD<~Zv>nOK7peV>L}$OtSCM%>;r~!rYH}4gYNXkMOSHKEgR_jEQ~}id(jaJu#gtN?hm7D*D)D z_HDW^sylwdc8#|etvM~_Lq$+2*XM-2l*eKkoh5KI9;jyv{Dap!yYabocH(x!1f{MH zB=BG~pA)5S=Ei)!kD+%Rec9+aMn5fjMbQU|9zXQ2p*IVCN9d_RzX*CE!1D)(9sF!? zkHM=2XB2!$aQ(nz14j$|BXDcL8v!Q*`F`ZGk*7rt68Sacj*u6D&maD5_-f&YgpUpW zBlt$3_d}1e*)|gZ=k*%SO*J`f1TCiatp6_@RFdy;jxeiI9lKzfm;LK2sjbQ_am2$JS}pN$gd%HguDoR{_tnRR|`KRd~EO^ z!8ZcEA381cHRvMPXJZeE{SoGV%xm~(<9~#A4fhewk;l&Kb_w$zvq$Z>N~%aGciuT- z!oj>;x?z|x|E^iO*ZcAJ5skQ~%AX_ZSY6Om*0n$V794r`{Hz5cgEMP>r2ZMmY}?Ee zs$IW}E$u~nmcB3|c3J1UI;`|2@~_8enZBIK%^wt(KVg(H>G(5kf_Y?{_(s!Lfq!sV z*Qn=AWj$j4?sn|)aWlEqQ;!Qd(cQ8J0hU}aLGL>Hve9#lep>X3q7M>1e&}CAZx;HF z&{Knc5%fZU=MN4$_}SndgI5dADEN@z`hmv=ju!Yw;MRaQ0!{?-{m5k_Pm3HR@@vQ) zAuj@-Km6J7)xr-69~=Bf@Qpz4hfWK94Y~;S+1Nv3e}uUo^BVry_#fe2!+nHvG{qxN zcHo8@Hu$Lst>tv@8Qn^u@6O|cmDdhXmyNQxz@I%7K2LBO!BhdSy(?VZ^E`ohF_K?^gVeZGghJQBxM|jt8AK@H5 zIq4embbk#iE)C-U*<~Z@j`RN`X1+q6f7e)Xfb>3^8Lhp)Vppb^J^fg)Ts4VR3!wx>*&iy&oTOG(JP8RNc8xje+|7^=sQAB z4f;jU3jv-#IPBnOgL@2KEjXj#LxSrE9ve7X;2(io1KtQY5ytVXTw(uKO}r?@E^f90=*wPE%Y_$BG_kR4~hK|=6=j;_-EsPgm(@15zf(JRp~tF z@kTliUOFP32m2hH$j^gp{WG(!#Z#im#v`Nnd60(LN;$8^d!>BNZkbZ9qE*WlfrmGx ztF6E>o4jQd&!3`izBU(BzEvi!o&xWM6(|Ut4%#WiDtd72Xf7gI$i=fg$?ANbnRdB- zVQHtNab#}n5k5c2(7TSlZ1fzXpBBBM=z~O$ANtqOn}xn3^wgkV1icX8`Gdm_em1zr z;MIaN3O*#be&DfzqXqsExHaI7fD?gyKXTc~(;^3n{2Fpc$cw<|4}Ug%weUm2#|Hlq zd?V2Nq0>TNgD!%7HujL%A7SptyoP@^{zrJ%a3A3unJtPD=0VHey23n|w5*n&2U+pQ zb93eNS99)%V)=QHo-&Z~g8f^ioY&ZeQa-1hij=FE-MCHQ;cf2iz;n!~`cIcJaf4ln 
z*YRgOH;8#pIm`22=+m-kL}8FSw|QJwp0B`6vt{{QJdNyne2wwBAZ~w`Y(9s|X2s6w zoc71+zutB9Wuxa9{j}&6MIR)3{LsIK-YoPTp{EA@BItzx&mSCi@Uy`^2Co*JQSc$b z^#hL$94+vVz^wsq1e^%u`;p5=o)$Ss(|iSi)3NK1NZ>1UbaE~C|7b+^woCKOazDFh9x~h`ex82t{yvk( zBJ{4KFB?6_=%+=mDEc7Ldt9gP#rVF?hA$jDimdt{-@8 z;Anw=1a1v@Bj7|J-;Z22^0df7BEN>*5%MDN`NN+LUoHHQ@Ug*v1m6hse(1E&*Px4F zpN%~v_D7ieF|XmDjsFqeHQYxyMzl`bE$S0iHiN?BHjE zdkkJJIHTZ0g6jt!8#r3vAAwr~-Uv7m$oC_cjXW)KkjSqgcZ9qMeE#rf!&eJGBz$b} zAHg>Qy&pO)^fl-r*k@x8iTx4ge#~q5XXAf_cMbOu&e4LpTwxwmEqyZY(4HXjP)?rb z;9E96*W~%-R8K3K=T6fiS%DW!m#3ZJIj{6U!d-#SX_ggo$mq;-k^H(Z0uL{F)Nz4h zc3{VEJEsw9b@XMU=Q!d2{j}&6MIR)3{LsIK-YoPTp{EA@BItzx&mSCi@Uy`^ z2Co*JQSc$b^#hL$94+vVz^wsq1e^%u`;p5=o)$SsXbhkpxr*u zR@v&)H_f~op?$2PKONAo_xVnrhHCZTY@4V}o2+dBc|bpr7{708x6xrqovwZ>+)T=H zl0LYWZKLz(H+R=piS*M%gDlzOTS%8PKN=dJC(%Rq`vkqPkECJyPXDgFxSAyE9q`Jvy9B|ssChrOEmXH<^Gb`$DwS@{s8w|Yl5h%?umz+Ef=s|55hJt+~-HO zEVFvQvN(`?E$cYG=!-LZ8a?-J?S%Ps+x_uL!e8-s)<*G2lI|a7fvfo`r zn+}cT4z;JrWm-714y#=@r|KuroRc$t6tqvLi9w3##@(G+%EWJBZnHmFk%vj!PS&)5eKxz+9?#9rw~ z^K522()ISY{FF<~xkImZY8A@uW8Ym?`F-)2L#z6^_K&*jM|_hfHhU;uWgUsbzB@Am zZV6pky6(LT>6Bp+eabh8ao*t#V|x+yBkI0!mk{TJ}xo=})m-)V2;g*J# z>o%RF#htwu>g@_(FPj&fb$_o*b4|qdY6T{w|ERF%G|`khcU&X%;9y%4nN!%?sl|;> zQ3@B`m77AfyIoW$WVn{<8($I)lH_yYO}+S&3H`66u@^Vs)G zzuRm^E;NUv;5?c`+pl9F}J z+v4Y}V#~JF4eF`5l{@w2c)+=ab*%c(oEc|m1Z!OUprAA^i#suCp4OhVtC@1nxS?HN zdNBVfbCwUh70X>3|Mu3Et;^V~qxR$&|8sR=U{)sUeYIKMS38)l zo^3FqBV#pNF)l!BzN-W2xz%jq-e@mwgN^DTOUE^=$LLSnT-+U5`Q4dU{CYdGXzwAZ z+81UL^QoSri{6KE$}bl++p>jR(d?&|Q!^uI=h|C=NpCz^eXOS9aW6mSoFY1uYQhaG&$0}7RKhyTo;MHaWY4_yhAZXIoXJH! 
z&%1fxbu~+}_ieS_rblKf{AwO>XEbNS47YBSiJ(*4Z>?23U_=i^oHk#n(t~w9a=Yv3 z8w0r!8wVYYE%GL=8(vtQ(7r%t79ThHw%v_Rjc$zy{5X!C?^&=;J-Q#!8ZYzQf0a5Z zIMaB#lhb4_;$7aS71uYge-_0{0%Ofsm%((6OG$fKu*;HT+^hcdsoAWFt9uqvt+dnA zwR=}n59PyRg;#gzkGl9i4PRMXn)?X;2_T}bYcC#GW`6VlSde|mX zi`ObGJ_ToJ+NvED0k68Uqz@~j&o>y5>fgHS25SW|+($S^53URnJvIA9Ki;}kt#RNR z?T~ZLdcEQ!jk=+E(s$k=reM8EQDyA~E<@vNshs#Us~BM7XXup9oGSO7eNcLWvzT+~ ze7nsdR_c3OBikXEJ*xP3Prc7xE|*2A?peN*)!H>Q74-MxG>)Zo%(}dh_;}yh^xNtv zE6Fr{kQcjv1!NvCyfSMwdywh-u5Hyu&gj0%%)9zN+@_UXeMJgcj4Cynj#Ask?tHmw z$^LGpLw4y0ogN-X))!1%RVUuWS(}Q^O!|<>Ez>Fe9@(5vJ#4;Jx@N9ruL?fzH+q)J z)OsbWSxNc{#vaZV?xT@fs=_&P*4JE><+GQ{(TWM(`x&4YQv3D<#$qOWV_R6Ps9x$JK@TX;2L3RPvFjP>>QVpd2jLcVklDB2P zF#}a=FB@~aL?>sSTe*R3pZ@KMy6qiWKdc*hWVM#g?fLdz+`w6kl@HtGLyfpiq7mn2 zOmZg+hqaFEd^vzy__4QQ{PJV0lgc6GNk#KnYx;iOg53WRbsmmfes3R_ijoGEO4?Es z6+)aGSqWJQAv@XGD`b-qAY~77++8 zADN`nhTTHKI^z2|_rM<4gOfUoaCYjc?iGjH@Cffm?fN1coH20j@2fG`3FPOaGJ-O^v^E?u&H-`MX3&Md`{elBQGfapyz_ z9NideB&?iEx!*)*O#GOSQSHgj+k*+HeuQ7oc4sX4X)i*)P7NUEF80k;dUG6KOq(X# zSbLCa7R4T(xj2IsTdXY~{GEl{IR9E<-ZixNJ$oMs@4LaCBVL6UCbdN!ICgq_r_H9j zxTVN*s)Nv4YIt_a&oNz2mAg9XL&|YVD^C|>ba3tA{O9}fOrKg?Vi&fz!lk5-J}-YQ zm^iG7vu5p7-y-5hY)*)^6)bv5RkJR9H~8$1x?H_pK76`|tQzSz6-+C~PiKw$N|xB- z<6gV=4dm~jQ@igfJ8xG;PB*_VG3NLlria?fOiy*=*S9(^2DR_UUC2lJjDRtnrl2yU zXTKfoa&jtus7=FPeQsR`aM<%8h4fy6Ar!XJEy9W0WoTC}* z_MXc$9>%&Ojmo}XJZN8-%jmE`98!|HbaTn2eCjT@dY)BR5stTg{mkZI0h%#W^;Jkp z9+vwXeSLjhA@)ps;v*cLk6d=xUVkfc0qfb{&g(iGK@z#M2ZLMfh~qYw&Y9M^H2hoP zi%a|msFk$uo&{Gtv4^SI_#LY_qVpuE$oXbG8Q<;AyZ?PL)|Gj_PIGoHeT#J3#l{Qq z%-`4k=5Go@mM_O14{F;HEkS{#yGtyILCmH%{#+-pTe@-0{lBqvQOSP$b#ZAlG@$rT zo>C^(%`IZ*qoMDM*mrGgmH~SoRZnhZ&(S8|!!@5LKE$3S7mH6$nNNr6OmEf&hm*3Z zSIBhVB|0h8w`{r#dd%~<%X8n@U^&VQ;OHj6|$UBSe3pYzMl z-FW^``(8cYI;^~4iuIp#bMkGO**lBy8VYkZ%t!F9!F>ehNOh_IocM?j`0}HNt#0xa zC}>-_l4E5q4lmiwS36?}n;Bd@A75F3`iRKu-`?@K?#{_yR$dSAhC-ovpZDb>{e66+ zgEg-BWKq!GsDouFS3q@CePIpxUU$GtZE^xNG7+zco>)TNZxl&PY|6(Gaog1C#w^qj zdP#CEPX@`|-R~sOltb4l{58Imaf06XqAct=qZ-e(^1136Z$Q5H49iT>4=2JR4^C*z 
zmO&jOBRr-fDLC%F*m&ft40=V{LE(vgI-M$;^R@Kp5j^GnsxO>5`?9fR{Bq8GgvAB+ z6>#1)eBw^~M$UbNr%HWk*AeUZ2LIGy6mlvlx)nqlZQ&z{nT+eVzZ@l*J4oD0b`-lG{@ znugA09P=;#G#lT~eE8U8QarVq5gHh|!vG&lu}&H;u|`v8Tsl1Wp($~BJvXzsIGDz| zwNI0~e}UTH2{wNjMDf3^Aj2qgY{6lOH6=gd7mz1!(zW8Af5+qO;9 z3bE*l+h%WElxef&+5~%h6?)KC##&bH4jx;rI6-qxAAV~YX{D`6v54EPpC|Aw8ZD^j zT(UX~-3y;|Oq;tEE1S6$09rkupB{aSP_?jf#r;OS8KT}+e;&Y$$~QsHoooz0Ei4%7Lc zRx7K$E+P&if$LP1<589?U+4F#bhOZ!i_Twn5OW(mDY!B#l@=DQ^gd@Cfs9vo+Ab&! zAS2Qe{G~jxD0TPGof|%T;5RdDl9FXS>6HF-@#5k*xxi&F5(zWZzgZMOW|}U(I8?2Vj?b%KB&KYK+NW!2UFY^i ze>#`Oo@w(%15@4jGyDDMjiCGML?fe!N~6O?qiZST_@0-_-iwc8ov?J_Ex$a;)BMJ< zWT``B#=^D_B?V1*N_=vW%w-=kwDtLi5CepVj`0feC?!%aeWxDZuMy+m1)k}Q$VI8Y7q2$^*oik9Z(q8@v+aKzfg+&5&J+z-Y(U6eyH8cK(Q;{R9 z+FBPUk*lNbgiBqXX75hSY|%Lk%#Xy?;8I7F-?iNkNSU?uMWIF6q1@WiMe{lTN7%S4(Q;M_ zQpw$|XY%SemQ8rkE&k34t8v{sp4qEQo3|b^8u)tF*=&sOX$CzCSQv3pAK6E{jMZr)Bebz3+Ytw>$Ik>WySQS zMzc(0XDP0}U;6Tga~7%&KfX8ZXe#lklk&UJTSndD*BiZ>P)WlsJX3BCyMU);#AT~@ z1(B-CD&e4HU$W`l>pg$_ol&HZyZ*?fBRErfB7e`Hbh?&nOtUyJhg$S08g!`@;^Njf zWgPv8#{c5I$JzVw8%wQYoP7;P78Pf4=4`yZ`ZqHlVeZtG?7Jpb@LeouY(E)U&)>{bC+s{H7z<44UloG+cUH))YK@ie;@t{|08!}o8_ zS|l1m&Kl|(Jm^S865AfgUcI{+2Q3pW{<|`pR&81Qd-JG14HXGe4d69I4mC%c=dTV! zkrlD_yk-vcL&*jGCV?gzEbpPxue%K|di7%0;&xvorX)R2&3-?5&^@?j6Q3VJq9Nrs z9#~<~*Q@-O>@UN`&O?jyUl~yau7-2u^H%I{>9{H8_(R-2VYE5*-F^HvmbN?HZ^Np? zzZTk^>!My$d*{7-k${~0l4@6PZ^NFs&zchQlBvLYf%n`~N~oDUSMi^ia{BAL!xG;Y zanxb>ae=$}HnOzz=c#pkIsbG>(2rp6hkXs^Y?zPWU4#1w&e23+@gzytH~5D{;i&(a zBBEU&v-4L@2~iZxc5dAA1UD(&jZ<7)K*~b~4$HfHW=-peNP7kCVC7Qqa z>UTbhLtM-6ecCqegS@9jKdL)(7$1*(TE;uzNJZ|ZpV+35j?SDA8n$ZHo53RDY&(+rhPw$jH&!Gu<{GDFwPsFn?}O{w|F) zNlOkvd`-fgI3of*2;x#aknBi)nh1WMoF0tsOE;>Xk_|;m-nFY2EKWtw{HJKCcP1f? 
zh9UM{v+DWgIFCD%l-wR)d0i`*7+bUyUq5*fTb5Q#2$IYvKRT4q{F%|j(BsPo6`MA^ zcoeqkQ znTKCVP99F`ZKmSWXDiFTFD71#B^3IWWTNh(L_+pXMl<(5kH@~HC-L3g#|h}!phJRw1baX1YcOZSd<5?r+(&SZ zuE)Dr|a&{?)x8Y=xG4aR6?e$M^a-czv=f-2?TIJm71uADra1a(Z z|6PW|-tSr{oac>)XSZB!b&I7Da|V{T4aO7qi&yXR$cK=}PXl_lG+yQWhdcNC>b?vd zTV`^#{9P8!UsaRSWOcqQ$w?+dc|)g;$!8F{MqO2mr;zjCNaGhP9vV|mqg~53 z&4|Lz{^U@FZa6Kyew5L(@p&Ouhr|!2>|^vJoN(Kj+52(Bw^n9f!!bK&`ZII31vl@G zpSJqMV&We5T@x#gQef^Q{jxf#+3_Z%|BRo(gz|nWs?`5Q)p0i!U4OOzkD)htyUBLg zM6rVwt)H>U+$jRN%~l<}doLBW@zmK|ldi^x8oZWO*4ShJ$Rc`Wy94#jHmE;Qv<1~2 zDwz07WjBf)zI3~yxE*W#EJkO4SYfMc_r@EDJ#|k^uoriYKvLDqB=&xbCSEH(O1<5Z zg8s->emqoKO+WiDcVCgY2hV@%pzg9j2S=}bF?p7tDfJoub%Wk3ZhCMcpI|hq}K0E>wCE4|O#tbVfB{ zDt}MR@J}V?-RdlJ@%tw9Mlp5yWtj^Ud=b#I|LKtb(~n^9hkXs^Y?zPWU4#1w&e4sp zW7YfL-NogzRHe;MB$MMcFLh2gAEIJ+BO0oH7ovr)m5!18LNX{<`DfX;L^^hNZ&LNA zG|IDZEU4&EF5Z0Qp+Ip#A^N>$cIT6d2$DTKAa}&1kbe36NznX$Ce`}-D9817H6E|{ zIdV}{2c2uaDW;yVkC?UI`;oQP4Xqac#dV}U8P6`B685?;nASGWm|ycVjf$*vowDIp zCO&yH&4%NPP$3(>bsRk#+m*_#=ID@E_+IU7j($YfD(cSR?ETnlRGQt_?#sxuKX|A? 
zge4p8(>U{y$j}M)T~nSr#YlF*licnL`No{1ZKeuJ2Y0mN%*%Bi5`%u^(mao@gI;OW zqwnjS{UI?#H|X><2YnNi_WR`KR<&H98e$jNgK4&atQ`LSNvm;}IyOtta?DtkX zUdR{qe|*q-{H$P*X7&2rx{`3SZcTOPi0h9{MnwCYpjJzu;n<*)t)iC+9io~Leuc@O4}9r`?e zxfxg2i(lvGZK7%6VMc3K#-Xm}fTr-ai`YW0+kr>3h7JucFRQ3JPYqv3FYd{!q?!gY zBhM=eY3q*Tlk`P3NKHPC-?*mYpT7pa2V9NUFO+VAoL`j6e!cF8Rvlb8^Y!tANJab_yRT_W9?M<-_6P|d zEKCUza3bz6{(IL_lH>H9IeWvh!y9I=FyBw)>fcE0lD&p!)Sy2ATVHhmO**0=`ZAVxhav?l+GM(&B}!lZd0>Zwt6Q1(hhUE9uXShOpcgm&>L z)mxNfON#=9X#XMtJ}vlb;ERBs4LT&~N3i$9z6NtP%t!F9!F>ehD6T!OWA41CIHy8O z?0rfiFsQMK&D9T^$cT$=^Fnxz1;7+4xNb{qla*vb-jm2-!z!DcQ%7g~^Ij z9ae`w3Ie{04U!TeZ)(X;Vl z{$q>|iPw1~l`;BJam}-DW|q51$|yIxuN_dSW9RHUQ%=tuh>sv5XARkRO<@)ndmlOd zm^4ebE|7@WDJ+dEzCgWqhoyLVA3*Cv7lw;Vxe)F>tNiG+J=i*xN4`zQj+z{5UH>jm zj>;MxvD372M3UpTMK4ZuAV+`R-z{!@7OTGhJFQ(NhJJkZhTnZl0{$?xOMAPeFA~~Q z?;gJKAUXPKSzD!87}{h}du8%?2{n&%ymNZj0h~5@aIQs5I6k$!NWsh`mMTm(K3Oc# zK%*Nss>~HXN44F5wx;YX#P0U%9x9I3;YX`J_v?2YB$EamBo^H|j|2Hu%ygf61#|t( z@7q4F2Y;W`Ci2niHf~z_qB%3+IzIAkXr6Ah0V=i4%X581Dfs>1(}KSSz6j{qphJRw z1baX1YcOZSd<5?r+(&SZCU1^kb-S_;Zx(tS-+b5+Kj2kP}f&b2ek>y5SxY|1I`?%UnwmKHpsNlHMj1V%TA@?{F~o=gZ3B zsxhVt!TQ?+En~?l_mxKvmxYnjqd_D6f7X()#$%r9%lUDvQV0FI*Mly()@-L}u8b45 z=VU9c-itmeh|xiBTQoy;>AU4=YUupz;MwnYEI@9nj-AdoK8iXfXLqFS%0y+yW_d4J z5QT1@IT`Tjbu@V)uCsZ;(^}kcLj2mMQ3Eup+%{{~fn8|K!lS9+gbCTZE&XEuJ5Q1w zD_b6S!x;Su7B}@=xTg1sO1HJGzuK7w}* z?jtxy4$ps|wkhtx+9RGipUOgsoa=!J{vEdHo{r0AzdQCM_L^yO#zjRu^4Xi8Po*9A z%`c3e;vYvcq%F69nX5{w^?kp8YIh{Z7ry`9TWwFr!mngLPSK}FYHr2+yk>_b?uGxb z-Q|f=x1^iBk=H=NANLBDyhued{W)TOJi2IO@x5jDYL*kZU1#c!Mrx8UF^}yg^DI%R zpP%VFODA%;Me=QNg(n$NB0UA$VvtI#zfMsp6MPVo&-LkGN&Z$%Wdui;yo ze6#0^=b?d>eGeuZa^7LvUgllHGrZaZuR3L-a`6`S92G}A35*qtq<3Bwl-cqJlC{Fp zTQ!2!$kg(WvuU(`?c}VnBKH~ zbtc5()AEOEOH#?Z_4!jIH>DHHea7pWW=9h3`OeEOhlL<69=U|XwK-J1(eiiHv~*Ha zt=*Viqk|k;uj@KnZb2J`hH}K4wj-S%=031@)^!T}*Gu^P#%_>LUlW?uQ zS4>BZ$gtY4^;_}jm^+z{k7!pTIn2Lgf4nMYmrll1;N>Fj1EZ!$6q0ieniVo z%)ha5p9^_0)X3~>G%Wc5GiTF+6+TG|C;5|T%`eQmMqPAN?E7?$(D9N58%}7uBEG!p 
zV)w@x*unDg$?P|J_=od{XOYij=y+5&x9%l5q?;!9E9spt3Y%HIbMWbL{I)LCbN(bp z9H?M%DY8(F{wOmQC{i^-bDF9QEI;ieOV>>=OPoEO2w#7*l|#6v1NCL^r=@S9v00zj zI$o2Y9-G~qub&mbJSUzOs)?PXZ`;Ns4xK+v3-&({ePnu+raX$(YYQ#JE7zMPY5WNx zx4CxNsY#X7Q66*s&~0LPeL-Pt>X{gv`umAd#)(k;@k_aeyqzyzUv*y0ahd}8U4MPk zs^Iv4IX1{2L2d;6e(-6*UjttR^lZ=}K|g}MANDnvvtd4hcMa|%I7dAT>ZB*1y^YQ5 z-*gsl-9-fq+w>I0?MSkz*|xEF`e>>DvE4)2!C1>G>&(a3*YJh;lX*I>hSImE`V4h^ z{K&jv;Ke{92 z;SiH!D{m=|Wb#K@l|D9 zzxXWEMJso3er3%$N84sBTXA+tI88mh=*hf4s?;K;ji=9BhP*p`-E`VEJEG^eYUt>p zMR;}h;xGgMB5KfZt&uhJq`SRnKliZpf+Z2uS+H-rd%O=d=fC`V$6*H=@$_Zp z@KsSXGRba(_A&o|c}U2yLH-DGBjERgPYeDU_#&WZgANJ$5$yf2ufd!R^AWsja38@r zTEE0zMO63>u9G<_5$P3#Y!V|%?oSUvL`_ERsiPycJz73^d`&nNUhBQ$8vjk4BNd~E z{ne1Xjo7%Sl`HABZIGK%tbkU`bMW_B7KndH`QDYu4WZ@F`TEYO9iq4DPJV6J8i&sb zFBWC;km&INJtoIyq81*)(v#+kBq!X=(nGzgcRWIb!dcjVCz1 z2;P@|<0nVY#{F|vzaE)jhB8$DGWroV*x1eN{a7Ju!7060lhBKkE2KGdHfO#&*?*;3 zg=|~;i;sVWF;eACf6m-Tmz6fK=O}gJ_lUWtRsvefMH^lKP{|6-GB|dEVD!<@ST|?s=zhnA@p>FGtFwGz zj4B$?dTrgx`EMlTY9S8^IX1{2L2d;6e(-6*UjttR^lZ=}K|g}MANDnvvtd4hcMa|% zI7bW1x0GDdxrsgG>ZDra9H`_%^%%*reQ4ZIV%6ZXD6-SkuujP}0F^JdJ3aqW0{&U+ z__(_+hOV!(Dd|&lpu&|OpH-Z9MRs*Q$&U6`gh$=5S6|lzp>uJ0ZVg+|1ko90QlFRfyHT6!`iV3PR; z4}C7bM}5L+RuMZ;8zUNnbytD z*?qsoEt&bKGV0lcK3`MfKlA2G?inUzB;uo0;va2vZtjXdquyzFBv#+hz1@MTZt}R5 z927`Cf0*8P90e0CjgwzC&fARgH5b`7Jy=H%?rNVRdN7uX{3!Y;x>pCEyi#3xzRMb| z4S39BJ#LM-LYk&j&D(;szAChSd9@k2#5rG8Kbnf|hda;JCdNIS0pv>H;OsL?)zH>Q zuM6Kvdr+fgtDnaja{mANzkD|2Y9S8^IX1{2L2d;6e(-6*UjttR^lZ=}K|g}MANDnv zvtd4hcMa|%I7bg{6$I8y>cmn%fA#9M?<7Z-dj2@3qfcJ_TD#z{U6Fcc&MV*0JNdy%)bdh$9UO_J~Gx8I~( z7rk5|wC1s{2I?&E*?2A74>j$`(oWO!MBF;w(krK{p}rmWgG~0PBHz{f8NVOjnQVRV z8lM>{3lbI#5syJ9r}Hwt2+m!vrE!ASjaPPt9w`-5gy7wc!THpn0S%Z>cc?+2e2{59}JK+gsp67(b3`(a;$IUD99 zc-P=Qf^(F(|G<%7-RxqX^W$IiEpQsO%)utU4(Id;x|Fn;zsGaqH| z^I+e#{su+Y(98Q!b=~()jphC%KXq+?RrWco(9$^L+zuC<8DMh4zW}4d?ZxJuIVm@zLKe!;Ns&0-@^h>fLyC$lM|8X*M+S{=9b* 
zU+QUou-t+EvMQS9v25CN_uaUuOv8NQ>Ri0sdU190b6*rclN`JGu!D;J+?oyTj0jCDtvSE=&#tBZm6trJku}e?7zMU04d!f^kKkQ{`v}g_M7z_Eo+))9;2+ziR8Y-)~+HzrkF4m-}2N7Ds?rF+_46R`5(@ZkeVsrYt_h-9_hF@ay*;dQ!36rZ08r8((Jw_UAcmtvxT*E+4;y~q1_xo6JrwM*q(9rxD#3Z?ZR~C96jZB z_Ds7}O>e)}M>D))sN|ZP*CVP8iTG|UsrjK>P?PW_ao$f`&~TD@>W{g`c z!JG~A5xi?~AHg{??)}xNJ+T|>E05}~T;zwIScTm29r7d{^Jfcu=W`(b6?L~FA4Fq? ze(O`b0|#l~k~g!>H>K012G-ZMa{SF$o7bnA-qE3Br@fiJ2tKLE=KM1k3bOg^*tgf% zTy2DM8k2{lYg5+k4)o&uCw2K}CVzy*WEQG%awD`QBz>#Wb_q16H}N~ir^VtmOBjC* z?+_Q!mgDq9R&@|6gKGJO1z@DT1mRg2dX&gViRLJu~$wG3w_}$rQKerR(6%XJ1 zim)cZb5k=K3WTYw+131oBTo3Qr$+!j7fZvr2R3W%RHNGMWBq8T4bQr#k<*YxvH2UJ zr;~(EVF5FTcT=vFQ(xtnj?=#b(DJ#P`PH;f!x6uep56 zhGc2Vu|=69bI@7Qrepp>D(H)rdaZfnKC(Jyhc}alylGx`OjO>8to!zNUfj4JQQ6D+ zJr5^0QrEif^C=fA;;A;(D*vZ1@hk|{Vf?lGbs>L_laqdNv)%=Z@s)ko2Kw ze8OHwImEr}>c{n~)KHT|X8tiL1vIOq(3&%6W0P&yVmR{=JruKWZGWH{O1o~@lJ{^o zvfFLCQ%%hXZMxmK)n>CkZW;OUCF=YNs(K+YWMb-4tlK%9@9+wf$u~8+rtg;`t?yOV zbY{wv)U6extMsIaho#B-3|=pkRyz1W@0K4rde-cU=H?hADsZQCzNZ7pAFPtxP*s7~ zZmh^1s^#eLJi2Gp;w+Hr0-g4&;kKwpxjXdA1zRK{CLA|&mJ%B2R=f1VY&C9qVjume z&6Yq968dY0?cbdJ zzxfE>HMo!99G$YeXZBR71)FlUNUki^z#Ht#xRpLwBgg7S9htedP0xu2 zD0O6-A(N~9wM6nv#r{AfD!+P@c*ZVt|Af>=CVv$9tHWUPNG!U@wa)R(;WecEqyg*G z{_S-Tw7VCAe*NlWd=Z>&JcrSSf+NwKskF|$!Vay~+x_>1J2xy_P5JjU$4hQHoX z-OQP@apB-#zPn!_x%R>(ymO`jDJ>hHT_vhSF6d-`WzG@bUtt$Lfk?`=HT(XOms7Ef zwzfs#R5OH2w(K`4o`sYjT3xugW)V`+olxv1x)DhYZYep)`5tD>_PEWP=S?1LYP|ol z%$s;yx1dWZrpSCj&D{B}xs>Pg`cFTXIg@AY@49*3A~e}P&2)L626@oly1QRbm8gZC zUfyl2g@%3GcV5x5q|g0txewiOM$o5)9whYFpmzj)5y<&NJ{xkikcWgE8|055Hv)b? 
z__W}!fiD7jHt3L`AHm)a`x?yIFdxCY2KN!1qlv<9RWmbMu==rvd3%>@5Gnu2{Tmdl zNl5DWvFcgIC{$vLjO{KBB0pQNqwbdg(NLFa^V9p=NE^=2^LTBCLV6BNly_G`$==)Z zw8kumvGC+O$Ja)ni-OzOe70xCPQD1=Ky-8-mt^+NFr=3EUye=a!-U4^m!i;%XL&zV zueqUcbsyI6Z!f+yc;Rgb>Ui{+@z<#D#A%E#LJ!)T=tg@wk++vZ{RbWF33q+S?N(Dw zdw?Id?*m=qqZbjY)+e+@-HYLUhe2WD%9FbI5&4R`2 zZPDYYJn8vT&PZ-++=@wc9z=9kh0LgW z^jTKQgjha&_2r<9CK1{kXTWtg3a9z+HsjrBjG&hdeOl;2LVpc!JG~A5xi?~AHg}=>^rweqWlWZy}E61 z&V;q}mEIed=yOig@TAh+HG`h`bMKq!oL)BV8?R&fwD>6BI6tQciF+)nnEo18*EeE% zN7%dI6WbTr_f%fbrqZyG^=-&DGu?K5KH(B^*7nInUSv{1LC>#Od8< z{E+mUR@U$T_LOH?(+^*^rE=yYtQE04a(a;w>d2gwbmO2csS0#5xUt_E zc_rxUnMxJXi~%_s@>30KC-3%q`>|iS`e=qoLi!5&Wa@R^;DIvSxiwmLHYvvOsnt3G z4+`<|3l|jYqTqW6^W0Hkkb1`MN@IGpWAxl31x~)``s$J5o-{o5UHECoj-%K{ zdX`$(NId>9vf%VfVuvod9UZ+@l=QFP54~*Y(?Smt`fJcTg1!jk{2`wWxmw6WLXHjc zN01u0X-XZNYIa9?}vR2=4_ac;9Z0J2+q;>fa*ri<`#VNXH*Bj{uZnk zByOCw#*2>ct=Por_tU;%?8oV4f} z(e%*$M8#B1mC4nPx3{x-$Oc1`3l%v&B=uz%n?E`}X1~`iO_r)<`uQAxjl)rnWPB0K<+pVqPwxs8-WT&)dDGKI?F(xLOSm_*F74 zKBN2(JMhJtC&7zycjLa}FQ03!Gp2Js6n)NhkisY3kKg~o>HWPNHPHyKbfmx=0Zs(; z`=OT&eOl;2LVpc z!JG~A5xi?~AHg}=zF>`^fbUi8CMe9u;f>%_-fi_9P6Q2(KX8}R@25c~ZG_Xy#w};v zm_9AuUOPR8=|MiKT@)+TypQnJe>7)$M;B=iEfoDZ3$Kr zMXy@}GR=MqkuISd3z>HSxYT*xe5iVJ3Rm&sStc*myPWQ_hjUms^G{ zhaX=cQFea-%oyS86}I2UWh0yR~o(CG_d1k zR=q8*ypq`YwR|Ri(A==y@v$*k+L4jhc18mOw+474z=?oK$B z^M`ykBREIo+$xq+ zw7vg7PLWslah6kbYG5VHDOznXndKB|%DFR~BHAp-@2Xf7Lk>A5GMpmX^JReH6j9NN zBE4cZ;X*HD<2(|%2oTBolmsw8HH?DUqr)cuy$0^P` zERe*yyjzONTgmcKhdbY#jL{k1+YF}&$MW7%<8X?onZ&)X98MACmYu_Jif|YAm<+?; zDRKPukm2vls#In9I|WxKE#UBXXxG7q#oO1qBf;F~?i~IO^&OFA`98!Yo8|i$`y{b^ zADe;)4BrPA{p67!D_0>lWA_=p&p-YVaBF}!0-Olw_d_om`n1r4g#H@zj-W3BIe*A! 
zL#`I`kdR}8{1N0v!0!j27W_5vML^F69TM~-*!y8$gE<@KBY4-~K7w=PHvKb^O{ z-#IMu`UAt?G1C9Cox|V3bH>Dr^o%q~>;|o94u1!~(lTZFKK!mVEZ=7X|JnceKI)+i z-v{S=OCQqFRYN?}S-uYiju!BbfLjB+5#U5XzaM(p(5HnSB=pyycLaSA$oWG)8*;Uf zhlCs(PB_m_i45x^ylyF_+aEdS;d%45x@ri?X=M@OQGmnX&wxNQq3Azf(UP zt;gZ-&^M@hay={(U8z79}cGo z`;1-HTem2kh#2}YoFaUzN15Ri(FEB$A@}6c$iVVV^&CzSwrpO=aEj=FSz_YiKpoV6 zJUpMnDWc2e`|6^8?nmdR20FCp7ZZud5SCM9V=|xR6wQ$AWI09cTeA9B{R%*v8%p9{ zZ^|QUmz#WII7Qm4kFlJhRhsTBr$}M(D&8iQ2%`I^TIYL7Dw((KyQ-hRBT;R>&2WmS zeEK&T4yOpq`@}JvBE0qH8irFu9UC4xGyEOjt-LIMCw!_d%ik$Z&Sv;KblvU}v*?K; zk!@dS{Hq1xxuEBi-=SZT_h2<2P`}r}PBHSY0s(L^!kpwQ<*~#G)Va1<6 zg*codI=|)Nwb9sg(mFqJki#jWjbEG@P7(d^e4X~?+i|4)?M8-EL_b$e9@%%)2c;j| z`f=x)Y?5(Wo8=VQEa+l6Mcnn)ET?G4{_|gFE)PT(CL9#*{+>!gM&nsdQSjyOET`y) z0cJTxNKnIYY{-~oe&|p%^$Q_4Jr=I}_&bnPWwV?js*A)JP7w|=OlCMmn1>!`I7QT{ z{@MhFzay*M$ntlr);O^Io$$|Z82%2eIkLvE8@VGFwE%{{gO_)Puza81)(tG*=b>;c z%lEl^@(9EC!QQP)lrBs7k+NDHhVMgx4+&g9;IRQm3;0LCtpVN$a3Y}J54~*Y(?Smt z`fJcTg1!jk{2`wWxmw6WLXHjcN01u0X-XZNYIa9?}vR2=4_ac;9Z0J z2+q+%Rd1G4R6NljXXAk?RO1-Ia*8}}S+JZUk`&&p^rRH=XX&$?qK*y!af)&mr+Z$H zK8Eh<#Il^ChN%*Vawe3MHiM!=+YK&~rALo2oFaTTcooAb!huR{45x?&`OOX8@GzEi z%)L{-QXq#UY|LReMYz;*;!X~y2)oCsF`ObQ{BC7NZ$}~#tB-Nu`5cP!o77HkpUmN! 
zy_n0(aEg-NO=3Ak2ePeMPSNJbV=Sj=lI@ce4u6M+WQZ~R9lX!6km2ue`udV741Z@t z>7LDlNjykG;AjE=2)H%>cq9MgL_ohEdfCvYg&rjI*PwRe$|O z|8xn;(EpE96n*zq)9P2n#HikP?(9ciL?rb;PElp+9MP`S2qLz0_%w%8L`y$wGn^u9 zIIM5|tGtNxPu+dOP`-pb`F6+uxpx%F*Z6sv!zrRir(9+@MK~bai+ks@0-{BQ+>K+z9yn z;M0P?2EGXB*`Py$egu0z>}xP*!+Zqq+Q0kgf9L3L%vqLGG*Ebt#nNC*f zjj^1fA?ra&hEw$P^o68toylZU$#a%dH0`o=+k+oD`E8@`B;}U@rSnhVO%~-uRF2 z^R;Qbc+>K+z9yn;M0P?2EGXB*`Py$egu0z>}xP*!+Zqq8r(;4 zjz-acoT6D%+w?Pa3Q>>svSEf(RNh_Aa*7<)A4WtSOGF(xHKGis$ZY+xu}-NHV$k1p zG9>aGAuIeCP7z+dP>|si;l-gH45x@z`QEV6=-fv_>=vkqbEgr$&EI`FoFZ(nOM~GQ z;menwFq|T)rA&I_O~OgxT`q;Znov|5*m8cph&}Ni4`Vq+d0(GQV>m?~@{d?fQGSRl z%PH#W>TTxmcX0VNL59DBw=~@-;_!FypJuP641Y(#bF<-$zX3#g4XR}LJ3Z8%;rmcN zwT}$nhstp^GJGF;=)yS5_epxHT;Jj!ilS8_S-ubOvw?dIyjtLl0v{5%e!yb`ju!Bb zfLjB+5#U5XzaM(p(5HnSB=pyycLaSA$oWG)8*;UfhlCs(3OOn&n`(d6?b0Y~f^P7xh3tzh_gm{6;*?M~$)h8mtX7a)w$AQ2YCrpy_l&>Qx~Eh<()K2W;)?@fjGR#WcgaSl z*#Ssw*H@Om6Q4Qqv$!>kDDKkpC^sxZT!nAkV~&(GI$0I^ z_c@%Lz<*dg;U9+`_}Rcc23{?2Mu86rTtDEk0Y?k?N5HKC-Ux6apx+O@Z0OTM4-)!o z&^v;@2;}@BpAETM$U{Po4f02j8v(x`d|L3=z!w2M8+1s}k6`bIeGTSpn2+FHgZl{1 zk!z?A%PF$Cu&b2e6x|PaG*e%xka&skH!_@}!Yt8&o|&1X!eW`>#?%uet$q!|DWY@r zo-mvuN=|n(oFXj!M;t`_o;kYj`V5#&a|?+2e2{59}JK+gsp67(b3`(a;$ zIUD99c-P=Qf^)>p_mkxm*=fa{ywn#@uFq1?WjIA^%Zt0V+{z-_Nt(M4j+7GZtw$M7 z5f)PoWjIBcOPu8t(J!VSpP4y$5y>&xUj4l>aCaEkCk8L8PEP7(gW^MK(L(aQ~6 zRrDKT$bDl4Ro)g)deC>U@cek!@dS{Hq1xxuEBi-=jhgHKbBJz$ooD%G~y6J_y2C^aEfT)h`c9dh5Z_ z&-v)r6B(9MR3p~Ja*CRL9a&D%r9C^dS5M0%fuBxyKUx@#=K1taR}MUY6xO#&=V(Te z^`GyU))(v{_NN~YUOMQ3Brn`Q5#1G!A`a}lcJ-Pk((akY@^?jQ$10EZ2w19sE+#2AG04DM4wh3?rLD1t;S}X-oiAfJ zMJwN2xXN&ftSUqq{tmTyUdZrwXl}+$hQC8a`ubS@&b8^EM7f_Pk~anUEPp4~Zau^I z!7ev;F?=5^UL4NweQ5iR|M)&M@4~(MJU8;-LNv?wf&bLN|3%=x5Ww>X4m8Jz+(fB7VwY$aclm^8v#xP^!uTg4SibZK|+5GdPmR~ft)|&vmsXtc}U2y zLH-DGBjERgPYeDU_#&WZgANJ$5$yf2ufd!R^AWsja38@ry0JHwq5O=y) zd$Bti=uX=j|HBA<2s}&f`1z0@X%AUWQONbEH4LX{_;=gaObbi$Dp|vj_q8cW4eq_J z;SfmLFTZ6tMbyrG7Q-pR!$_Io6k*A4D-&)^k42BtG8>{@0|=UC%k3T`ixyw$x8vvJ zl#}MvW~Dx}C8l11{iaR!WS_}E%{jLn$$jp|2u;J)WH8m;J*;S 
z^9K$)@Uww?47^(4i~=7LxPHK61CAE(kAPbPyb<6;K))Y)+0dtj9whYFpmzj)5y<&N zJ{xkikcWgE8|055Hv)b?__W}!fiD7jHt3L`AHm)a`x?yIFdxCY2KN!1qx8uC>*-AV zLdxDa9I}sfEKQ=&*moJ*nXWCx*ku`^qKzySlBiIMBI{VPhOv(&gNc(X8evRh``H;I zGllHNHkRK#hsXbLKj)tFzTfZj*z8oCqBmPkU)|kdvGni{%6(WZQuI$9nlW@lv~Yg< zHc4@c#vPcfIYmt_6%05sJXtJCXpT%-QqL`jx z_T04PRr{Y0Oh5VFqlakW<*N8ThC}JtGhM!0D)DCFJHk_gUj#1%JbyUs@U!6_!>ffe z3Lg@#A3QcVTJVqH*1#Kq6G6Y9UN(JNdXV(j=pE4)A?Hs%n_Mk2#%RFHeA*# z(X>!)_jq|AY^UZF{rAmd%_%BhVVmX@4H?%z_wdX{V#&4qBJH1y5s_;*q`2ITkw*r( zDo&9(eDHyvEKZR*t?4PnDKdw&$y1ynqiEuNSH<6%_4DtVzthC+eSgK@sg@t8_&cU_ zZ@MD4rME0LZ;#^d7~8(Ntoc5D;_hm`&!Z)on(yN?DpK)%%nLc&HaLG}d4zjuzK?-_ z4R02{BRnmFB*#Ynh};PKe)hEN*Vv0N&t?wE{D^ly?=|k(+>iLLaUSsL z-}z@suIBG-IzKJn;_n!B4lHYPvGE{ryu(7p-!Xk!UU;PVKE=HX)lqz(;q9|E-{;k$ zIg0ONPJ3{rq0d6g>-6rb;`@B|_~Bo}n}zQPPYr$%yb$pG;jqKchIXFYfjNWp<6Yl z$gX&(<`fkVPl-D*H%Q!>6p7?QmnMLNDTdnvzA@9y?Q2ZUQmqsbY-|12RVRehYV^;b%tNY;_}lDru@_;U%^Z^X5$}H9YuvNBAMsz~JmNc=xaT9MXj->C%_*`g zQ8u;MtUyuLx7@O{%R$^>!W0G??H-FWOjcwL2-%#49zXnRc(d>w;iI2H)xsHt4++-~ z9vd7j_(yPS;ElkEpx;j~n?5Z)NcwB^j_8Y!^CzE8u9iF`IX3b~#a1j_h+Cr2F9yvV{;E-==;A%P_`^5h zBBxzwPEza$vD^G&!?6m3WwrT#=Xoa1kwyBOM;~?!kjW=Cf5%*N@WiU^l_!f%=?6BP zdNx5k`EJU?Ft<^{Epcs7|DiGRM*PU@QOzP{rBXjCzK`M5C^6n+OQ^t4i&qpMBpyHf zYk0Hp9pS0LFM<~WoO|F(aBsn(nN90D>_p_&Ezs6pKc{X!M=108yd9QKL=6=L~jq`}_Xzk?> znp1QuI9YRwM88nYDXQVxw(yPPBjvTy0omstj1<}Bsucg;#ZT7jeN=IZ%zc};C{B@? 
zHNahQip<3YH^VGWk?}>N6`H@(qQoN2-w~0|HGk*C!6AyjV@`b4`p@f?<7LB>d5XVd zco%zpU-5mKcdf4ZJ`W3B{ZjFL;(n^3_&(;+gs_z`!vP(`RqByPm5O+A0!?> z{A+l#@Ezf)!7qXr0-iq{cKF$FkKxtA8HEoC*AE^W94+`qaBJX=z=@#WPcNH3Ej>v3 zYxIuji;(jtpG~foJR~_b@<-%G*!Q!iWxvK=gn2e|Najbp`+2W%&*px_e~t5q@5o%b zTXTxe6m6_IMTdj?9Ci>jFaI{tSJrB)IYk!tV5Q;|nUlX;pg2Y5 zyZ;(0PLVOwB~|lxie7xH`8yl8rfB|-?};Ufzhn6Q=-SH2njo^CdMW;nd46GY&G%XV zU!3Oql-C_YF$e)!k$X5l-+ zQ-fawF9bY)IPCDV;U2@Qg)<5t60RRSHaJ@FkKoq88-Wu+zn@+`i_bePhH0={*Kl2d#3q5sgqqb->2MR zd(HQ`U1pHt`xy69rVL4!^F{Q5MvCua;$6p=jprCYEnZQ4ka+y?ui?$YcZ8<~zX)Ck zc>ZwM;b+4={>-cW#2JMT3D*xE8yqe8M{sN4jlhZc-0%O?%cf6D50d^Gy(9V}R!DqEw2<`hLlG}D}- z#J3Sbaf-qNKctPd3y_sh-iz@$R! z^8W6M<`iA_`%80*PW>8EGh&6GZ1Xz2po`;7IW%cvP@EYcYsZ9%Ja=!=XWP_-;tPG` z{$oK)mb@7u-HME#P}e_D-Zfu7{Bm}6tIu~+^LIwt`&}v(=P#qj{O^l@t4GKlKXfnj zXGFN%HT8YzR${UY3f>;v!aq!k`2CvibN9lmXM2au5a~ZTjow_Xi&*#bojo?aJS5(A zeA#%8@zdfJ#RrMU5C0n8EPO|JYVeETg@ES|haG-4++%pPa7N)n!u5m421g725!@Ph zBXA<<_tVR!PfHJy{u;d_`Xc1~$!C+RB@aoCjr#)5v3zyL{bL=YL{^YjO1t@iqF>PD zXSer^ljA*t2BpOL%3e1dCzYx0D|W2f=NOY1And&7X#P&c^iFNk!#w51^aD`#M6M;myK#gr^3-2wn(y{&3jgXTv>)R|{tpJ|tW} zcx-UA;2*)Qfj0stf_^`}Z2GkHAnC8sJEAW_&YyfXxmxm&tyLIT#uMb5EyKd_=f9KMJbBga{PFNJL_&(w2{cA&iG`*f=?zP-h*F+O~|Kg{CRm^M|mC~l3pBQ;ZTYd(9| z@nz#V#!rh^6dxoWKm2QWv+y0^slhLT7XqF?9CrBGaF5~D!Wo4R3D*xE8yqe8M{sN4 zjlhYZ-%l@_J}o^+`fK!#=!=l^C!bBOmOLamHu6X0M%eeWr)9s!UW9ozb4cb#y!&~t zanI&{#D9(Ri0>%ur=K;a$Zuqy-xR0FcKk<9k$td-;_nzGTBj=hj#1X!U1(3MC}Z2( z0h+%vvEKGaO^=6(XKvFqf5&_CZ;J0@EUTZO_&!G4Iqr(@WBgn5u7l$Hv_0Is{jsPi zqF0`a=KItudD6+^))=^T;<#%NSA*WKdQeD<#6%f@qz zpBAquK1e)%_}B1e;XA@pgI@$M1U!E@?C`VU9>c4JGYTIPt{*%$I9l+J;MTw!ffGT$ zpI$b7T6&Q5*XSM57a`|QKAT)Ec}Q|>_kj#&G_w!!kp3VJ; z{~G5J-_g-Y37S*%bn_z3DJod#*vsPYnErc#oxJq*(NVF zxxZL`G`Nf6?>t|3OYwb-b=iFs-^V!J@V4Un7#D-@Xui+;ysRDdLPA9+p9szOsj+fM zC5u~Q2KEixVR37WhZXCdx41Qi>$%M17PrPMaL!fS8Z&cfqeR86!Mlzx8_zL*TD+q8 zAo2L&U&EV)?+8x~ei6J7@ciMh!_S6$46hc>D11n`e(>1fXu&^%TLW(dP6YjadfD`8 z=|R$8qjyAKgq%P5Y;v{aA<40kKO#56zMnlU`!)6=%(Iz8GC$(o&wGt~Huod`Yn(@X 
zM^Dby*PNmqd5O0z{*Kw{y_rz6{WI6$zid+c9rIMwlqZV6Gkk1|m{PaK$hq5cQWSq@ zQ^Tr??_gKsny5c`l)phx^FzkSNBAX`;Xp6KCT@u@vh^`#&e9H7OyBiNIZV{*YIZHJHk_g zUj#1%JbyUs@U!6_!>ffe3Lg@#A3QcVTJVqH*1#Kq6G6Y9UN(JNdXV(j=pE4)A?Hs% zn_MkfWeJ0*@eA#%8@zdfJ#RrMU5C0n8 zEPO|JYVeETg@ES|haG-4++%pPa7N)n!u5m421g725!@PhBXA<<_tVR!PfHJy{u;d_ z`Xc1~$!C+RB@aoCjr1euN-+0lSSAKsd|9+`r;E(~whyI;u)VR4|wEB+LxQnCjS}ZU^ zni~^soc9?suRXTg={ecxHEBeLjh*|-_pQ<$*ADPAM<1W3`A`EZZ`E9%i{I|kJeCLV zx@e9@*uxmjKe#`1rs}O5DPJt;ju>WP}x5=iy?jL+WnPrhtC$M|XSisFOB z@5@It`zhrsPLV}qjw{|Igkyb(AN^!w>$)2F2e zNq>#r5q%MI{^YaC)slxK$435$+z9)A_O$HR*o!dFW)8{xh<88lHSXEmkNB@~9`POJ zHFvZ<+whX{!amxw{fRSXZttE$bJCZI*12!8*Pcr^);H_iMV&{MicEh;t6St3nYibe zaqGg4-<{5GH^SdsyYRA6Tbc5rufqoa)#jtdR}^Q=d~q#W@uAFut@{<%$BbCtS@BrR z3By+?j)v(l<`2z3*t|bt=kGICiQT0?_KiE;d#ih*OIF-EUe|fCz`Kqw8_zL*TD+q8 zAo2L&U&EV)?+8x~ei6J7@ciMh!_S6$46hc>D11n`e(>1fXu&^%TLW(dP6YjadfD`8 z=|R$8qjyAKgq%P5Y;v{aA<40kKO#56zMnlU`!)6=%(Iz8GC$(o&wGt~Huod`Yn(@X zN4p!}uQJc(ve9q(^UHtvg^7crE0*#@n4eZTTTQ^2c4y|9zQU;Ou)W zz%5G7h%VM)ddOlCSFzWq4fU3Z1@CHT{=vrB`eTyz)e)g3&sQzDohX_dZJ>LiXPhg! z_FXhf;$6p=jpz9GCqFG-QGAei{P3^g&BAwtrv|?WUI=*paMQ>95f{qAxX zF*8i@p^OsyMk=n4acR2k3dLh-<=NAz|LD!qe`c`eAB?&%SNGPBby;@&d6)Il@8}lY z6V1t-towa<*YRcJImS#49zXnRc(d>w;iI2H)xsHt4++-~ z9vd7j_(yPS;ElkEpx;j~n?5Z)NcwB^j_8Y!^CzE8u9iF`IX3b~0#osnR+WdXocrC63TlZ{pdF)*$>waXsdhDzIYle45 ztAXk~I@sMNwcF!9()Rf2C&PcAVfG9x>Z16Z?nl}UR$Rp~)1pr)9$t_AvB8RCX1}X* zVa1Juxz@77XrI?Rw&w~knR1#@)7k)G}zG2+z?|BhCl+3@=2 zjGxs*HT#{b?NRh?oWQ$|FB{J>eph3^PY4So^45b*rru*1)Wdkn7@ z&M16HxPI{1;Anm3AARE1z#D-RLBF3~Hho%pko4E+9nlvd=TAPHTrGJ>a%|*}$c?b? 
zXHUz1jlBr-Z03;6k9ha=MmpgnPI*4d$8l#kMF^^ZF=bU;QRmOs`sG9 zb1xyoDx`=zQ;Vthpy@of`gO&5Js394OYu2}`Uh*S;)ziXnumAdTBhchT}-N=_*3S( z(;{zvp;$4l@5@It`zhrsPLV}qjw z{|Igkyb(AN^!w>$)2F2eNq>#r5q%MI{^YaC)slxK$435$+z9)A_O$HR*o!dFW)8{x zh<88lHSXEmkNB@~9`PL|b~&ftgQ50W`aRfX$u9LCG_tz(%&c)eOxi4VQ13yrO{1oo z7yNBrl}C#6nzCTK=5rcPVl-D#4jQj{c;#2;D2|zVC!+Lt$8oV@(N{wiH^>;=+Gf4Q zdoe4|?_2&z??oa%CtdLsjJ|nssuypbOY7w48k!_t+a6avR3ot4;)9n9LL}aGeA#%8 z@zdfJ#RrMU5C0n8EPO|JYVeETg@ES|haG-4++%pPa7N)n!u5m421g725!@PhBXA<< z_tVR!PfHJy{u;d_`Xc1~$!C+RB@aoCjrchT>`vU`WB_n>k2O!-ST5#wb4at`V}Xy!Zhu3>SfjpUe>zgoOt^KtP< z-7U_mQGV{%iqC1-<)2VoMZ>mIR)XT;#jIcWS7^&fQE<~eU-75jzV>re+@Qg4ZuIwY z8z#)wmo%rNYoC^yuaMJiSM%!MN6I5kw{@R6CEjII!f&2p;)!$V=VOKoyzBU~@f_o) z#Vd*r5|1DLHN08)j_}mr7r_ew&mRsu{OlK>xX18n;f%tEgzE>74UQK4Be*s2M&Lxy z@28hdpOzjZ{WW?=^hG}B{6FQh$<>mFB*#Ynh};PKe)hEN*Vv0N&t?wE{D^ly?=|k( z+>iLLaUSs<{cih0zXu<7&5C^GnJh~_nx)@^Q%bcg_4Uj)a!x^1#KGKVVrjVwwk@yr z6t`_0HUzEdC0)NN;r)K`5IL;wfPq!ry36OyV>A!1@LRtlt^1CUe)n#y9Xq6(tlH$6 z<^~maAJy{U>$c+8{+n%kJG;rKoMM`<@Wadesy+7f5$_}Gt3I>YXJKR2Lp7T1-d87} zbQ_6x9bYz{WBjyuMe#x6@x#A{Hw)hpo*Mijcp>2V!(oS?4fhycEu2yKkZ}FrvBA-T ze+0J%-Uyrs`u+5>>C@7Kq`yY*h`tCpfAZPnYRN;AV} zX|u@URK*K6U)CI~IIrf#-X5CI`DcHxTN&pz$~sMj=HWSQ{BOJBm`&Wc&A2#hv%K4G zf#wFKtuZz4WvJKMHgVypa-;8R%~u$=;i~J_AHqaV^<@d=|LHB>9XP3asODqa5vo6E z;9bX;jprCYEnZQ4ka+y?ui?$YcZ8<~zX)Ckc>ZwM;b+4=hF1$`6h0(eKX`0#wBR4X zt${ZJCxU)Iy=?ll^dRZ4(L16qLe8IjHo02zkmT6NACVhj-_M?w{Th1_=Gn|4nIG}) X=e@=~oBI*}HO?cxqn(k(^n39CQHD@0 literal 0 HcmV?d00001 diff --git a/tests/xc_integrator.cxx b/tests/xc_integrator.cxx index f10501c2..ef7b0f31 100644 --- a/tests/xc_integrator.cxx +++ b/tests/xc_integrator.cxx @@ -244,6 +244,11 @@ void test_xc_integrator( ExecutionSpace ex, const RuntimeEnvironment& rt, // Check K if( has_k and check_k and rks ) { + auto max_l = basis.max_l(); + if(max_l > 2 and ex == ExecutionSpace::Device) { + std::cout << "Skiping device sn-K + L > 2" << std::endl; + return; + } auto K = integrator.eval_exx( P ); CHECK((K - K.transpose()).norm() < 
std::numeric_limits::epsilon()); // Symmetric CHECK( (K - K_ref).norm() / basis.nbf() < 1e-7 ); @@ -408,7 +413,7 @@ TEST_CASE( "XC Integrator", "[xc-integrator]" ) { func, PruningScheme::Robust ); } - //GKS GGA Test + // GKS GGA Test SECTION( "H3 / BLYP / cc-pvdz" ) { auto func = make_functional(blyp, pol); test_integrator(GAUXC_REF_DATA_PATH "/h3_blyp_cc-pvdz_ssf_gks.bin", @@ -421,4 +426,18 @@ TEST_CASE( "XC Integrator", "[xc-integrator]" ) { test_integrator(GAUXC_REF_DATA_PATH "/benzene_631gd_pbe0_ufg.hdf5", func, PruningScheme::Unpruned ); } + + // sn-LinK + f functions + SECTION( "H2O2 / PBE0 / def2-TZVP" ) { + auto func = make_functional(pbe0, unpol); + test_integrator(GAUXC_REF_DATA_PATH "/h2o2_def2-tzvp.hdf5", + func, PruningScheme::Unpruned ); + } + + // sn-LinK + g functions + SECTION( "H2O2 / PBE0 / def2-QZVP" ) { + auto func = make_functional(pbe0, unpol); + test_integrator(GAUXC_REF_DATA_PATH "/h2o2_def2-qzvp.hdf5", + func, PruningScheme::Unpruned ); + } } From 3ad2d32d2bcb969f87cbfdf4a741e17d1b0babcd Mon Sep 17 00:00:00 2001 From: David Williams-Young Date: Wed, 13 Mar 2024 13:43:20 -0700 Subject: [PATCH 7/7] Add L=4 to CUDA collocation kernels --- .../collocation_angular_cartesian.hpp | 102 ++++ .../collocation_angular_spherical_unnorm.hpp | 90 +++- .../collocation_device_constants.hpp | 7 +- ...ion_shell_to_task_kernels_cartesian_l4.hpp | 175 +++++++ ..._to_task_kernels_cartesian_l4_gradient.hpp | 330 +++++++++++++ ...l_to_task_kernels_cartesian_l4_hessian.hpp | 440 ++++++++++++++++++ ..._to_task_kernels_spherical_l2_gradient.hpp | 8 +- ...l_to_task_kernels_spherical_l2_hessian.hpp | 10 +- ..._to_task_kernels_spherical_l3_gradient.hpp | 24 +- ...l_to_task_kernels_spherical_l3_hessian.hpp | 42 +- ...ion_shell_to_task_kernels_spherical_l4.hpp | 156 +++++++ ..._to_task_kernels_spherical_l4_gradient.hpp | 256 ++++++++++ ...l_to_task_kernels_spherical_l4_hessian.hpp | 330 +++++++++++++ .../device/cuda/kernels/collocation_device.cu | 39 +- 
.../collocation_shell_to_task_kernels.hpp | 6 + 15 files changed, 1964 insertions(+), 51 deletions(-) create mode 100644 src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4.hpp create mode 100644 src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_gradient.hpp create mode 100644 src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_hessian.hpp create mode 100644 src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4.hpp create mode 100644 src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_gradient.hpp create mode 100644 src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_hessian.hpp diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_cartesian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_cartesian.hpp index fa5a545d..dba51eac 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_cartesian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_cartesian.hpp @@ -227,6 +227,99 @@ GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_3_deriv1( } +template +GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_4( + int32_t npts, + const T bf, + const T x, + const T y, + const T z, + T* __restrict__ eval +) { + + eval[npts * 0] = bf*x*x*x*x; + eval[npts * 1] = bf*x*x*x*y; + eval[npts * 2] = bf*x*x*x*z; + eval[npts * 3] = bf*x*x*y*y; + eval[npts * 4] = bf*x*x*y*z; + eval[npts * 5] = bf*x*x*z*z; + eval[npts * 6] = bf*x*y*y*y; + eval[npts * 7] = bf*x*y*y*z; + eval[npts * 8] = bf*x*y*z*z; + eval[npts * 9] 
= bf*x*z*z*z; + eval[npts * 10] = bf*y*y*y*y; + eval[npts * 11] = bf*y*y*y*z; + eval[npts * 12] = bf*y*y*z*z; + eval[npts * 13] = bf*y*z*z*z; + eval[npts * 14] = bf*z*z*z*z; + +} + +template +GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_4_deriv1( + const int32_t npts, + const T bf, + const T bf_x, + const T bf_y, + const T bf_z, + const T x, + const T y, + const T z, + T* __restrict__ eval_x, + T* __restrict__ eval_y, + T* __restrict__ eval_z +) { + + eval_x[npts * 0] = x*x*x*(4*bf + bf_x*x); + eval_x[npts * 1] = x*x*y*(3*bf + bf_x*x); + eval_x[npts * 2] = x*x*z*(3*bf + bf_x*x); + eval_x[npts * 3] = x*y*y*(2*bf + bf_x*x); + eval_x[npts * 4] = x*y*z*(2*bf + bf_x*x); + eval_x[npts * 5] = x*z*z*(2*bf + bf_x*x); + eval_x[npts * 6] = y*y*y*(bf + bf_x*x); + eval_x[npts * 7] = y*y*z*(bf + bf_x*x); + eval_x[npts * 8] = y*z*z*(bf + bf_x*x); + eval_x[npts * 9] = z*z*z*(bf + bf_x*x); + eval_x[npts * 10] = bf_x*y*y*y*y; + eval_x[npts * 11] = bf_x*y*y*y*z; + eval_x[npts * 12] = bf_x*y*y*z*z; + eval_x[npts * 13] = bf_x*y*z*z*z; + eval_x[npts * 14] = bf_x*z*z*z*z; + + eval_y[npts * 0] = bf_y*x*x*x*x; + eval_y[npts * 1] = x*x*x*(bf + bf_y*y); + eval_y[npts * 2] = bf_y*x*x*x*z; + eval_y[npts * 3] = x*x*y*(2*bf + bf_y*y); + eval_y[npts * 4] = x*x*z*(bf + bf_y*y); + eval_y[npts * 5] = bf_y*x*x*z*z; + eval_y[npts * 6] = x*y*y*(3*bf + bf_y*y); + eval_y[npts * 7] = x*y*z*(2*bf + bf_y*y); + eval_y[npts * 8] = x*z*z*(bf + bf_y*y); + eval_y[npts * 9] = bf_y*x*z*z*z; + eval_y[npts * 10] = y*y*y*(4*bf + bf_y*y); + eval_y[npts * 11] = y*y*z*(3*bf + bf_y*y); + eval_y[npts * 12] = y*z*z*(2*bf + bf_y*y); + eval_y[npts * 13] = z*z*z*(bf + bf_y*y); + eval_y[npts * 14] = bf_y*z*z*z*z; + + eval_z[npts * 0] = bf_z*x*x*x*x; + eval_z[npts * 1] = bf_z*x*x*x*y; + eval_z[npts * 2] = x*x*x*(bf + bf_z*z); + eval_z[npts * 3] = bf_z*x*x*y*y; + eval_z[npts * 4] = x*x*y*(bf + bf_z*z); + eval_z[npts * 5] = x*x*z*(2*bf + bf_z*z); + eval_z[npts * 6] = bf_z*x*y*y*y; + eval_z[npts * 7] = x*y*y*(bf 
+ bf_z*z); + eval_z[npts * 8] = x*y*z*(2*bf + bf_z*z); + eval_z[npts * 9] = x*z*z*(3*bf + bf_z*z); + eval_z[npts * 10] = bf_z*y*y*y*y; + eval_z[npts * 11] = y*y*y*(bf + bf_z*z); + eval_z[npts * 12] = y*y*z*(2*bf + bf_z*z); + eval_z[npts * 13] = y*z*z*(3*bf + bf_z*z); + eval_z[npts * 14] = z*z*z*(4*bf + bf_z*z); + +} + template GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular( @@ -255,6 +348,10 @@ GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular( collocation_cartesian_angular_3( npts, bf, x, y, z, eval ); + } else if( l == 4 ) { + + collocation_cartesian_angular_4( npts, bf, x, y, z, eval ); + } else { assert( false && "L < L_MAX" ); } @@ -300,6 +397,11 @@ GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_deriv1( collocation_cartesian_angular_3( npts, bf, x, y, z, eval ); collocation_cartesian_angular_3_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); + } else if( l == 4 ) { + + collocation_cartesian_angular_4( npts, bf, x, y, z, eval ); + collocation_cartesian_angular_4_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); + } else { assert( false && "L < L_MAX" ); } diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_spherical_unnorm.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_spherical_unnorm.hpp index e6f102d6..968cc3c8 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_spherical_unnorm.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_spherical_unnorm.hpp @@ -187,17 +187,17 @@ GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_3_deriv1( eval_x[npts * 0] = sqrt_10*y*(6*bf*x + bf_x*(3*x*x - y*y))/4; eval_x[npts * 1] = sqrt_15*y*z*(bf + bf_x*x); - eval_x[npts * 2] = -sqrt_6*y*(2*bf*x + bf_x*(x*x + y*y - 4*z*z))/4; - eval_x[npts * 3] = -z*(6*bf*x + bf_x*(3*x*x + 3*y*y - 
2*z*z))/2; - eval_x[npts * 4] = -sqrt_6*(bf*(3*x*x + y*y - 4*z*z) + bf_x*x*(x*x + y*y - 4*z*z))/4; + eval_x[npts * 2] = sqrt_6*y*(-2*bf*x - bf_x*(x*x + y*y - 4*z*z))/4; + eval_x[npts * 3] = z*(-6*bf*x - bf_x*(3*x*x + 3*y*y - 2*z*z))/2; + eval_x[npts * 4] = sqrt_6*(-bf*(3*x*x + y*y - 4*z*z) - bf_x*x*(x*x + y*y - 4*z*z))/4; eval_x[npts * 5] = sqrt_15*z*(2*bf*x + bf_x*(x*x - y*y))/2; eval_x[npts * 6] = sqrt_10*(3*bf*(x*x - y*y) + bf_x*x*(x*x - 3*y*y))/4; eval_y[npts * 0] = sqrt_10*(-3*bf*(-x*x + y*y) + bf_y*y*(3*x*x - y*y))/4; eval_y[npts * 1] = sqrt_15*x*z*(bf + bf_y*y); - eval_y[npts * 2] = -sqrt_6*(bf*(x*x + 3*y*y - 4*z*z) + bf_y*y*(x*x + y*y - 4*z*z))/4; - eval_y[npts * 3] = -z*(6*bf*y + bf_y*(3*x*x + 3*y*y - 2*z*z))/2; - eval_y[npts * 4] = -sqrt_6*x*(2*bf*y + bf_y*(x*x + y*y - 4*z*z))/4; + eval_y[npts * 2] = sqrt_6*(-bf*(x*x + 3*y*y - 4*z*z) - bf_y*y*(x*x + y*y - 4*z*z))/4; + eval_y[npts * 3] = z*(-6*bf*y - bf_y*(3*x*x + 3*y*y - 2*z*z))/2; + eval_y[npts * 4] = sqrt_6*x*(-2*bf*y - bf_y*(x*x + y*y - 4*z*z))/4; eval_y[npts * 5] = sqrt_15*z*(-2*bf*y + bf_y*(x*x - y*y))/2; eval_y[npts * 6] = sqrt_10*x*(-6*bf*y + bf_y*(x*x - 3*y*y))/4; @@ -211,6 +211,75 @@ GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_3_deriv1( } +template +GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_4( + int32_t npts, + const T bf, + const T x, + const T y, + const T z, + T* __restrict__ eval +) { + + eval[npts * 0] = sqrt_35*bf*x*y*(x*x - y*y)/2; + eval[npts * 1] = sqrt_70*bf*y*z*(3*x*x - y*y)/4; + eval[npts * 2] = sqrt_5*bf*x*y*(-x*x - y*y + 6*z*z)/2; + eval[npts * 3] = sqrt_10*bf*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + eval[npts * 4] = bf*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + eval[npts * 5] = sqrt_10*bf*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; + eval[npts * 6] = sqrt_5*bf*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; + eval[npts * 7] = sqrt_70*bf*x*z*(x*x - 3*y*y)/4; + eval[npts * 8] = sqrt_35*bf*(x*x*x*x - 6*x*x*y*y + 
y*y*y*y)/8; + +} + +template +GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_4_deriv1( + const int32_t npts, + const T bf, + const T bf_x, + const T bf_y, + const T bf_z, + const T x, + const T y, + const T z, + T* __restrict__ eval_x, + T* __restrict__ eval_y, + T* __restrict__ eval_z +) { + + eval_x[npts * 0] = sqrt_35*y*(bf*(3*x*x - y*y) + bf_x*x*(x*x - y*y))/2; + eval_x[npts * 1] = sqrt_70*y*z*(6*bf*x + bf_x*(3*x*x - y*y))/4; + eval_x[npts * 2] = sqrt_5*y*(-bf*(3*x*x + y*y - 6*z*z) - bf_x*x*(x*x + y*y - 6*z*z))/2; + eval_x[npts * 3] = sqrt_10*y*z*(-6*bf*x - bf_x*(3*x*x + 3*y*y - 4*z*z))/4; + eval_x[npts * 4] = 3*bf*x*(x*x + y*y - 4*z*z)/2 + bf_x*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + eval_x[npts * 5] = sqrt_10*z*(-bf*(9*x*x + 3*y*y - 4*z*z) - bf_x*x*(3*x*x + 3*y*y - 4*z*z))/4; + eval_x[npts * 6] = sqrt_5*(-bf*x*(x*x - 3*z*z) - bf_x*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z)/4); + eval_x[npts * 7] = sqrt_70*z*(3*bf*(x*x - y*y) + bf_x*x*(x*x - 3*y*y))/4; + eval_x[npts * 8] = sqrt_35*(4*bf*x*(x*x - 3*y*y) + bf_x*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + eval_y[npts * 0] = sqrt_35*x*(-bf*(-x*x + 3*y*y) + bf_y*y*(x*x - y*y))/2; + eval_y[npts * 1] = sqrt_70*z*(-3*bf*(-x*x + y*y) + bf_y*y*(3*x*x - y*y))/4; + eval_y[npts * 2] = sqrt_5*x*(-bf*(x*x + 3*y*y - 6*z*z) - bf_y*y*(x*x + y*y - 6*z*z))/2; + eval_y[npts * 3] = sqrt_10*z*(-bf*(3*x*x + 9*y*y - 4*z*z) - bf_y*y*(3*x*x + 3*y*y - 4*z*z))/4; + eval_y[npts * 4] = 3*bf*y*(x*x + y*y - 4*z*z)/2 + bf_y*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + eval_y[npts * 5] = sqrt_10*x*z*(-6*bf*y - bf_y*(3*x*x + 3*y*y - 4*z*z))/4; + eval_y[npts * 6] = sqrt_5*(bf*y*(y*y - 3*z*z) - bf_y*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z)/4); + eval_y[npts * 7] = sqrt_70*x*z*(-6*bf*y + bf_y*(x*x - 3*y*y))/4; + eval_y[npts * 8] = sqrt_35*(-4*bf*y*(3*x*x - y*y) + bf_y*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + eval_z[npts * 0] = sqrt_35*bf_z*x*y*(x*x - 
y*y)/2; + eval_z[npts * 1] = sqrt_70*y*(bf + bf_z*z)*(3*x*x - y*y)/4; + eval_z[npts * 2] = sqrt_5*x*y*(12*bf*z - bf_z*(x*x + y*y - 6*z*z))/2; + eval_z[npts * 3] = sqrt_10*y*(3*bf*(-x*x - y*y + 4*z*z) - bf_z*z*(3*x*x + 3*y*y - 4*z*z))/4; + eval_z[npts * 4] = -2*bf*z*(3*x*x + 3*y*y - 2*z*z) + bf_z*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + eval_z[npts * 5] = sqrt_10*x*(3*bf*(-x*x - y*y + 4*z*z) - bf_z*z*(3*x*x + 3*y*y - 4*z*z))/4; + eval_z[npts * 6] = sqrt_5*(12*bf*z*(x*x - y*y) - bf_z*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + eval_z[npts * 7] = sqrt_70*x*(bf + bf_z*z)*(x*x - 3*y*y)/4; + eval_z[npts * 8] = sqrt_35*bf_z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + +} + template GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular( @@ -239,6 +308,10 @@ GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular( collocation_spherical_unnorm_angular_3( npts, bf, x, y, z, eval ); + } else if( l == 4 ) { + + collocation_spherical_unnorm_angular_4( npts, bf, x, y, z, eval ); + } else { assert( false && "L < L_MAX" ); } @@ -284,6 +357,11 @@ GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_deriv1( collocation_spherical_unnorm_angular_3( npts, bf, x, y, z, eval ); collocation_spherical_unnorm_angular_3_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); + } else if( l == 4 ) { + + collocation_spherical_unnorm_angular_4( npts, bf, x, y, z, eval ); + collocation_spherical_unnorm_angular_4_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); + } else { assert( false && "L < L_MAX" ); } diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_device_constants.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_device_constants.hpp index 98d180dc..d265a8d4 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_device_constants.hpp +++ 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_device_constants.hpp @@ -9,9 +9,12 @@ namespace GauXC { - constexpr double sqrt_15 = 3.872983346207417; + constexpr double sqrt_10 = 3.1622776601683795; constexpr double sqrt_3 = 1.7320508075688772; + constexpr double sqrt_15 = 3.872983346207417; + constexpr double sqrt_35 = 5.916079783099616; constexpr double sqrt_6 = 2.449489742783178; - constexpr double sqrt_10 = 3.1622776601683795; + constexpr double sqrt_70 = 8.366600265340756; + constexpr double sqrt_5 = 2.23606797749979; } // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4.hpp new file mode 100644 index 00000000..2a0f65d9 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4.hpp @@ -0,0 +1,175 @@ +/** + * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_4( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[16][detail::shell_nprim_max + 1]; + __shared__ double coeff[16][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = task->points_y; 
+ const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + } + + + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval*x*x*x*x; + basis_eval[ipt + 1*npts] = radial_eval*x*x*x*y; + basis_eval[ipt + 2*npts] = radial_eval*x*x*x*z; + basis_eval[ipt + 3*npts] = radial_eval*x*x*y*y; + basis_eval[ipt + 4*npts] = radial_eval*x*x*y*z; + basis_eval[ipt + 5*npts] = radial_eval*x*x*z*z; + basis_eval[ipt + 6*npts] = radial_eval*x*y*y*y; + basis_eval[ipt + 7*npts] = radial_eval*x*y*y*z; + basis_eval[ipt + 8*npts] = radial_eval*x*y*z*z; + basis_eval[ipt + 9*npts] = radial_eval*x*z*z*z; + basis_eval[ipt + 10*npts] = radial_eval*y*y*y*y; + basis_eval[ipt + 11*npts] = radial_eval*y*y*y*z; + basis_eval[ipt + 12*npts] = radial_eval*y*y*z*z; + basis_eval[ipt + 13*npts] = radial_eval*y*z*z*z; + basis_eval[ipt + 14*npts] = radial_eval*z*z*z*z; + + + + + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = radial_eval*x*x*x*x; + ang_eval_1 = radial_eval*x*x*x*y; + ang_eval_2 = radial_eval*x*x*x*z; + ang_eval_3 = radial_eval*x*x*y*y; + 
basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*x*x*y*z; + ang_eval_1 = radial_eval*x*x*z*z; + ang_eval_2 = radial_eval*x*y*y*y; + ang_eval_3 = radial_eval*x*y*y*z; + basis_eval[ipt + 4*npts] = ang_eval_0; + basis_eval[ipt + 5*npts] = ang_eval_1; + basis_eval[ipt + 6*npts] = ang_eval_2; + basis_eval[ipt + 7*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*x*y*z*z; + ang_eval_1 = radial_eval*x*z*z*z; + ang_eval_2 = radial_eval*y*y*y*y; + ang_eval_3 = radial_eval*y*y*y*z; + basis_eval[ipt + 8*npts] = ang_eval_0; + basis_eval[ipt + 9*npts] = ang_eval_1; + basis_eval[ipt + 10*npts] = ang_eval_2; + basis_eval[ipt + 11*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*y*y*z*z; + ang_eval_1 = radial_eval*y*z*z*z; + ang_eval_2 = radial_eval*z*z*z*z; + basis_eval[ipt + 12*npts] = ang_eval_0; + basis_eval[ipt + 13*npts] = ang_eval_1; + basis_eval[ipt + 14*npts] = ang_eval_2; + + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_gradient.hpp new file mode 100644 index 00000000..878c3f4f --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_gradient.hpp @@ -0,0 +1,330 @@ +/** + * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). All rights reserved. 
+ *
+ * See LICENSE.txt for details
+ */
+#pragma once
+#include "collocation_device_constants.hpp"
+#include "device/xc_device_task.hpp"
+#include "device_specific/cuda_device_constants.hpp"
+#include "device/common/shell_to_task.hpp"
+#include <cmath> // std::exp
+
+namespace GauXC {
+
+
+/**
+ * Collocation kernel for Cartesian l = 4 shells.
+ *
+ * For every task a shell is assigned to, evaluates the 15 functions
+ * x^i y^j z^k (i + j + k = 4) multiplied by the contracted radial factor,
+ * together with their first derivatives with respect to x, y and z, at each
+ * grid point of the task.
+ *
+ * Work distribution, as implemented below:
+ *   - shells are strided over blockIdx.z / gridDim.z,
+ *   - the tasks of a shell are strided over warps, numbered globally along x,
+ *   - the points of a task are strided over the lanes of the owning warp.
+ *
+ * Each warp stages the shell exponents / coefficients in its own row of the
+ * shared arrays. Only 16 rows are declared, i.e. at most 16 warps per block,
+ * which matches the 512-thread launch bound.
+ *
+ * Output layout: for the task at position itask of the shell's task list,
+ * function k at point ipt is written to element
+ *   task_shell_offs[itask]*npts + k*npts + ipt
+ * of the task's bf / dbfx / dbfy / dbfz buffers.
+ *
+ * @param nshell         Number of entries of shell_to_task to process.
+ * @param shell_to_task  Per-shell records: shell data, task count, task
+ *                       indices and the shell's offset within each task.
+ * @param device_tasks   Task records holding the point coordinates, the point
+ *                       count and the output buffers.
+ */
+__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_gradient_4(
+  uint32_t                        nshell,
+  ShellToTaskDevice* __restrict__ shell_to_task,
+  XCDeviceTask*      __restrict__ device_tasks
+) {
+
+  // One row of exponents / coefficients per warp of the block
+  __shared__ double alpha[16][detail::shell_nprim_max + 1];
+  __shared__ double coeff[16][detail::shell_nprim_max + 1];
+  double* my_alpha = alpha[threadIdx.x/32];
+  double* my_coeff = coeff[threadIdx.x/32];
+
+  for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) {
+    const uint32_t ntasks      = shell_to_task[ish].ntask;
+    const auto shell           = shell_to_task[ish].shell_device;
+    const auto task_idx        = shell_to_task[ish].task_idx_device;
+    const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device;
+
+    // Load shell data into registers
+    // NOTE(review): assumes O_data() points at three contiguous doubles
+    // (the shell origin) - confirm against the Shell definition.
+    const uint32_t nprim = shell->nprim();
+    const double3 O = *reinterpret_cast<const double3*>(shell->O_data());
+
+    const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size;
+    const int nwarp_global   = max((blockDim.x*gridDim.x) / cuda::warp_size,1);
+
+    // Stage coeffs/exps in shared memory: every warp fills its own row, one
+    // entry per lane. All warp_size entries are read regardless of nprim.
+    // NOTE(review): assumes the shell arrays hold shell_nprim_max entries - confirm.
+    {
+      auto* coeff_gm = shell->coeff_data();
+      auto* alpha_gm = shell->alpha_data();
+      static_assert( detail::shell_nprim_max == cuda::warp_size );
+      const int warp_rank = threadIdx.x % cuda::warp_size;
+
+      // Do not overwrite the row while another lane of this warp may still
+      // be reading the values staged for the previous shell.
+      __syncwarp();
+      my_alpha[warp_rank] = alpha_gm[warp_rank];
+      my_coeff[warp_rank] = coeff_gm[warp_rank];
+      // Every lane reads entries written by the other lanes below, so the
+      // writes have to be ordered before those reads.
+      __syncwarp();
+    }
+
+    // Loop over tasks assigned to shells
+    // Place each task on a different warp + schedule across blocks
+    for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) {
+
+      const auto* task = device_tasks + task_idx[itask];
+      const auto* __restrict__ points_x = task->points_x;
+      const auto* __restrict__ points_y = task->points_y;
+      const auto* __restrict__ points_z = task->points_z;
+      const uint32_t npts = task->npts;
+      const size_t shoff = task_shell_offs[itask] * npts;
+
+      auto* __restrict__ basis_eval   = task->bf   + shoff;
+      auto* __restrict__ basis_x_eval = task->dbfx + shoff;
+      auto* __restrict__ basis_y_eval = task->dbfy + shoff;
+      auto* __restrict__ basis_z_eval = task->dbfz + shoff;
+
+      // Loop over points in task
+      // Assign each point to separate thread within the warp
+      #pragma unroll 1
+      for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) {
+
+        // Point coordinates relative to the shell origin
+        const auto x = points_x[ipt] - O.x;
+        const auto y = points_y[ipt] - O.y;
+        const auto z = points_z[ipt] - O.z;
+        const auto rsq = x*x + y*y + z*z;
+
+        // Evaluate radial part of bfn:
+        //   radial_eval       =      sum_i c_i exp(-a_i r^2)
+        //   radial_eval_alpha = -2 * sum_i a_i c_i exp(-a_i r^2)
+        // so the x-derivative of the radial part is radial_eval_alpha * x
+        // (likewise for y and z).
+        double radial_eval = 0.;
+        double radial_eval_alpha = 0.;
+
+        #pragma unroll 1
+        for( uint32_t i = 0; i < nprim; ++i ) {
+          const auto a = my_alpha[i];
+          const auto e = my_coeff[i] * std::exp( - a * rsq );
+
+          radial_eval += e;
+          radial_eval_alpha += a * e;
+        }
+
+        radial_eval_alpha *= -2;
+
+        // Evaluate basis function
+        basis_eval[ipt + 0*npts] = radial_eval*x*x*x*x;
+        basis_eval[ipt + 1*npts] = radial_eval*x*x*x*y;
+        basis_eval[ipt + 2*npts] = radial_eval*x*x*x*z;
+        basis_eval[ipt + 3*npts] = radial_eval*x*x*y*y;
+        basis_eval[ipt + 4*npts] = radial_eval*x*x*y*z;
+        basis_eval[ipt + 5*npts] = radial_eval*x*x*z*z;
+        basis_eval[ipt + 6*npts] = radial_eval*x*y*y*y;
+        basis_eval[ipt + 7*npts] = radial_eval*x*y*y*z;
+        basis_eval[ipt + 8*npts] = radial_eval*x*y*z*z;
+        basis_eval[ipt + 9*npts] = radial_eval*x*z*z*z;
+        basis_eval[ipt + 10*npts] = radial_eval*y*y*y*y;
+        basis_eval[ipt + 11*npts] = radial_eval*y*y*y*z;
+        basis_eval[ipt + 12*npts] = radial_eval*y*y*z*z;
+        basis_eval[ipt + 13*npts] = radial_eval*y*z*z*z;
+        basis_eval[ipt + 14*npts] = radial_eval*z*z*z*z;
+
+        // Derivatives follow the product rule, e.g. for x:
+        //   d/dx [ x^i y^j z^k R ] = i x^(i-1) y^j z^k * radial_eval
+        //                          +   x^(i+1) y^j z^k * radial_eval_alpha
+
+        // Evaluate first derivative of bfn wrt x
+        basis_x_eval[ipt + 0*npts] = x*x*x*(4*radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 1*npts] = x*x*y*(3*radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 2*npts] = x*x*z*(3*radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 3*npts] = x*y*y*(2*radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 4*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 5*npts] = x*z*z*(2*radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 6*npts] = y*y*y*(radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 7*npts] = y*y*z*(radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 8*npts] = y*z*z*(radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 9*npts] = z*z*z*(radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 10*npts] = radial_eval_alpha*x*y*y*y*y;
+        basis_x_eval[ipt + 11*npts] = radial_eval_alpha*x*y*y*y*z;
+        basis_x_eval[ipt + 12*npts] = radial_eval_alpha*x*y*y*z*z;
+        basis_x_eval[ipt + 13*npts] = radial_eval_alpha*x*y*z*z*z;
+        basis_x_eval[ipt + 14*npts] = radial_eval_alpha*x*z*z*z*z;
+
+        // Evaluate first derivative of bfn wrt y
+        basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*y;
+        basis_y_eval[ipt + 1*npts] = x*x*x*(radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*x*x*y*z;
+        basis_y_eval[ipt + 3*npts] = x*x*y*(2*radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 4*npts] = x*x*z*(radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 5*npts] = radial_eval_alpha*x*x*y*z*z;
+        basis_y_eval[ipt + 6*npts] = x*y*y*(3*radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 7*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 8*npts] = x*z*z*(radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 9*npts] = radial_eval_alpha*x*y*z*z*z;
+        basis_y_eval[ipt + 10*npts] = y*y*y*(4*radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 11*npts] = y*y*z*(3*radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 12*npts] = y*z*z*(2*radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 13*npts] = z*z*z*(radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 14*npts] = radial_eval_alpha*y*z*z*z*z;
+
+        // Evaluate first derivative of bfn wrt z
+        basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*z;
+        basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*x*x*y*z;
+        basis_z_eval[ipt + 2*npts] = x*x*x*(radial_eval + radial_eval_alpha*z*z);
+        basis_z_eval[ipt + 3*npts] = radial_eval_alpha*x*x*y*y*z;
+        basis_z_eval[ipt + 4*npts] = x*x*y*(radial_eval + radial_eval_alpha*z*z);
+        basis_z_eval[ipt + 5*npts] = x*x*z*(2*radial_eval + radial_eval_alpha*z*z);
+        basis_z_eval[ipt + 6*npts] = radial_eval_alpha*x*y*y*y*z;
+        basis_z_eval[ipt + 7*npts] = x*y*y*(radial_eval + radial_eval_alpha*z*z);
+        basis_z_eval[ipt + 8*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*z*z);
+        basis_z_eval[ipt + 9*npts] = x*z*z*(3*radial_eval + radial_eval_alpha*z*z);
+        basis_z_eval[ipt + 10*npts] = radial_eval_alpha*y*y*y*y*z;
+        basis_z_eval[ipt + 11*npts] = y*y*y*(radial_eval + radial_eval_alpha*z*z);
+        basis_z_eval[ipt + 12*npts] = y*y*z*(2*radial_eval + radial_eval_alpha*z*z);
+        basis_z_eval[ipt + 13*npts] = y*z*z*(3*radial_eval + radial_eval_alpha*z*z);
+        basis_z_eval[ipt + 14*npts] = z*z*z*(4*radial_eval + radial_eval_alpha*z*z);
+
+      } // Loop over points within task
+    } // Loop over tasks
+
+  } // Loop over shells
+} // end kernel
+
+} // namespace GauXC
diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_hessian.hpp
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_hessian.hpp new file mode 100644 index 00000000..5e5ac1c0 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_hessian.hpp @@ -0,0 +1,440 @@ +/** + * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_4( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[16][detail::shell_nprim_max + 1]; + __shared__ double coeff[16][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* 
alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + + auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = 
my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval*x*x*x*x; + basis_eval[ipt + 1*npts] = radial_eval*x*x*x*y; + basis_eval[ipt + 2*npts] = radial_eval*x*x*x*z; + basis_eval[ipt + 3*npts] = radial_eval*x*x*y*y; + basis_eval[ipt + 4*npts] = radial_eval*x*x*y*z; + basis_eval[ipt + 5*npts] = radial_eval*x*x*z*z; + basis_eval[ipt + 6*npts] = radial_eval*x*y*y*y; + basis_eval[ipt + 7*npts] = radial_eval*x*y*y*z; + basis_eval[ipt + 8*npts] = radial_eval*x*y*z*z; + basis_eval[ipt + 9*npts] = radial_eval*x*z*z*z; + basis_eval[ipt + 10*npts] = radial_eval*y*y*y*y; + basis_eval[ipt + 11*npts] = radial_eval*y*y*y*z; + basis_eval[ipt + 12*npts] = radial_eval*y*y*z*z; + basis_eval[ipt + 13*npts] = radial_eval*y*z*z*z; + basis_eval[ipt + 14*npts] = radial_eval*z*z*z*z; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = x*x*x*(4*radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 1*npts] = x*x*y*(3*radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 2*npts] = x*x*z*(3*radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 3*npts] = x*y*y*(2*radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 4*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 5*npts] = x*z*z*(2*radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 6*npts] = y*y*y*(radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 7*npts] = y*y*z*(radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 8*npts] = y*z*z*(radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 9*npts] = z*z*z*(radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 10*npts] = radial_eval_alpha*x*y*y*y*y; + basis_x_eval[ipt + 11*npts] = radial_eval_alpha*x*y*y*y*z; + basis_x_eval[ipt + 12*npts] = 
radial_eval_alpha*x*y*y*z*z; + basis_x_eval[ipt + 13*npts] = radial_eval_alpha*x*y*z*z*z; + basis_x_eval[ipt + 14*npts] = radial_eval_alpha*x*z*z*z*z; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*y; + basis_y_eval[ipt + 1*npts] = x*x*x*(radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*x*x*y*z; + basis_y_eval[ipt + 3*npts] = x*x*y*(2*radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 4*npts] = x*x*z*(radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 5*npts] = radial_eval_alpha*x*x*y*z*z; + basis_y_eval[ipt + 6*npts] = x*y*y*(3*radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 7*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 8*npts] = x*z*z*(radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 9*npts] = radial_eval_alpha*x*y*z*z*z; + basis_y_eval[ipt + 10*npts] = y*y*y*(4*radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 11*npts] = y*y*z*(3*radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 12*npts] = y*z*z*(2*radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 13*npts] = z*z*z*(radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 14*npts] = radial_eval_alpha*y*z*z*z*z; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*z; + basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*x*x*y*z; + basis_z_eval[ipt + 2*npts] = x*x*x*(radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 3*npts] = radial_eval_alpha*x*x*y*y*z; + basis_z_eval[ipt + 4*npts] = x*x*y*(radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 5*npts] = x*x*z*(2*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 6*npts] = radial_eval_alpha*x*y*y*y*z; + basis_z_eval[ipt + 7*npts] = x*y*y*(radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 8*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 9*npts] = x*z*z*(3*radial_eval + 
radial_eval_alpha*z*z); + basis_z_eval[ipt + 10*npts] = radial_eval_alpha*y*y*y*y*z; + basis_z_eval[ipt + 11*npts] = y*y*y*(radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 12*npts] = y*y*z*(2*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 13*npts] = y*z*z*(3*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 14*npts] = z*z*z*(4*radial_eval + radial_eval_alpha*z*z); + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = x*x*(12*radial_eval + 9*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); + basis_xx_eval[ipt + 1*npts] = x*y*(6*radial_eval + 7*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); + basis_xx_eval[ipt + 2*npts] = x*z*(6*radial_eval + 7*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); + basis_xx_eval[ipt + 3*npts] = y*y*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); + basis_xx_eval[ipt + 4*npts] = y*z*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); + basis_xx_eval[ipt + 5*npts] = z*z*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); + basis_xx_eval[ipt + 6*npts] = x*y*y*y*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 7*npts] = x*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 8*npts] = x*y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 9*npts] = x*z*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 10*npts] = y*y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 11*npts] = y*y*y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 12*npts] = y*y*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 13*npts] = y*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 14*npts] = z*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + + // 
Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x*x*x*y*(4*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xy_eval[ipt + 1*npts] = x*x*(3*radial_eval + radial_eval_alpha*x*x + 3*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); + basis_xy_eval[ipt + 2*npts] = x*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xy_eval[ipt + 3*npts] = x*y*(4*radial_eval + 2*radial_eval_alpha*x*x + 2*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); + basis_xy_eval[ipt + 4*npts] = x*z*(2*radial_eval + radial_eval_alpha*x*x + 2*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); + basis_xy_eval[ipt + 5*npts] = x*y*z*z*(2*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xy_eval[ipt + 6*npts] = y*y*(3*radial_eval + 3*radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); + basis_xy_eval[ipt + 7*npts] = y*z*(2*radial_eval + 2*radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); + basis_xy_eval[ipt + 8*npts] = z*z*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); + basis_xy_eval[ipt + 9*npts] = y*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xy_eval[ipt + 10*npts] = x*y*y*y*(4*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_xy_eval[ipt + 11*npts] = x*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_xy_eval[ipt + 12*npts] = x*y*z*z*(2*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_xy_eval[ipt + 13*npts] = x*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_xy_eval[ipt + 14*npts] = radial_eval_alpha_squared*x*y*z*z*z*z; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x*x*x*z*(4*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xz_eval[ipt + 1*npts] = x*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xz_eval[ipt + 2*npts] = x*x*(3*radial_eval + 
radial_eval_alpha*x*x + 3*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); + basis_xz_eval[ipt + 3*npts] = x*y*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xz_eval[ipt + 4*npts] = x*y*(2*radial_eval + radial_eval_alpha*x*x + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); + basis_xz_eval[ipt + 5*npts] = x*z*(4*radial_eval + 2*radial_eval_alpha*x*x + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); + basis_xz_eval[ipt + 6*npts] = y*y*y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xz_eval[ipt + 7*npts] = y*y*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); + basis_xz_eval[ipt + 8*npts] = y*z*(2*radial_eval + 2*radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); + basis_xz_eval[ipt + 9*npts] = z*z*(3*radial_eval + 3*radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); + basis_xz_eval[ipt + 10*npts] = radial_eval_alpha_squared*x*y*y*y*y*z; + basis_xz_eval[ipt + 11*npts] = x*y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_xz_eval[ipt + 12*npts] = x*y*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_xz_eval[ipt + 13*npts] = x*y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_xz_eval[ipt + 14*npts] = x*z*z*z*(4*radial_eval_alpha + radial_eval_alpha_squared*z*z); + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = x*x*x*x*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 1*npts] = x*x*x*y*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 2*npts] = x*x*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 3*npts] = x*x*(2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); + basis_yy_eval[ipt + 4*npts] = x*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 5*npts] = 
x*x*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 6*npts] = x*y*(6*radial_eval + 7*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); + basis_yy_eval[ipt + 7*npts] = x*z*(2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); + basis_yy_eval[ipt + 8*npts] = x*y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 9*npts] = x*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 10*npts] = y*y*(12*radial_eval + 9*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); + basis_yy_eval[ipt + 11*npts] = y*z*(6*radial_eval + 7*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); + basis_yy_eval[ipt + 12*npts] = z*z*(2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); + basis_yy_eval[ipt + 13*npts] = y*z*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 14*npts] = z*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*x*x*x*y*z; + basis_yz_eval[ipt + 1*npts] = x*x*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yz_eval[ipt + 2*npts] = x*x*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_yz_eval[ipt + 3*npts] = x*x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yz_eval[ipt + 4*npts] = x*x*(radial_eval + radial_eval_alpha*y*y + radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); + basis_yz_eval[ipt + 5*npts] = x*x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_yz_eval[ipt + 6*npts] = x*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yz_eval[ipt + 7*npts] = x*y*(2*radial_eval + radial_eval_alpha*y*y + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); + basis_yz_eval[ipt + 8*npts] = x*z*(2*radial_eval + 2*radial_eval_alpha*y*y + radial_eval_alpha*z*z + 
radial_eval_alpha_squared*y*y*z*z); + basis_yz_eval[ipt + 9*npts] = x*y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_yz_eval[ipt + 10*npts] = y*y*y*z*(4*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yz_eval[ipt + 11*npts] = y*y*(3*radial_eval + radial_eval_alpha*y*y + 3*radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); + basis_yz_eval[ipt + 12*npts] = y*z*(4*radial_eval + 2*radial_eval_alpha*y*y + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); + basis_yz_eval[ipt + 13*npts] = z*z*(3*radial_eval + 3*radial_eval_alpha*y*y + radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); + basis_yz_eval[ipt + 14*npts] = y*z*z*z*(4*radial_eval_alpha + radial_eval_alpha_squared*z*z); + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = x*x*x*x*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 1*npts] = x*x*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 2*npts] = x*x*x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 3*npts] = x*x*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 4*npts] = x*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 5*npts] = x*x*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); + basis_zz_eval[ipt + 6*npts] = x*y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 7*npts] = x*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 8*npts] = x*y*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); + basis_zz_eval[ipt + 9*npts] = x*z*(6*radial_eval + 7*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); + basis_zz_eval[ipt + 10*npts] = y*y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 11*npts] = y*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); + 
basis_zz_eval[ipt + 12*npts] = y*y*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); + basis_zz_eval[ipt + 13*npts] = y*z*(6*radial_eval + 7*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); + basis_zz_eval[ipt + 14*npts] = z*z*(12*radial_eval + 9*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = radial_eval*x*x*x*x; + ang_eval_1 = radial_eval*x*x*x*y; + ang_eval_2 = radial_eval*x*x*x*z; + ang_eval_3 = radial_eval*x*x*y*y; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*x*x*y*z; + ang_eval_1 = radial_eval*x*x*z*z; + ang_eval_2 = radial_eval*x*y*y*y; + ang_eval_3 = radial_eval*x*y*y*z; + basis_eval[ipt + 4*npts] = ang_eval_0; + basis_eval[ipt + 5*npts] = ang_eval_1; + basis_eval[ipt + 6*npts] = ang_eval_2; + basis_eval[ipt + 7*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*x*y*z*z; + ang_eval_1 = radial_eval*x*z*z*z; + ang_eval_2 = radial_eval*y*y*y*y; + ang_eval_3 = radial_eval*y*y*y*z; + basis_eval[ipt + 8*npts] = ang_eval_0; + basis_eval[ipt + 9*npts] = ang_eval_1; + basis_eval[ipt + 10*npts] = ang_eval_2; + basis_eval[ipt + 11*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*y*y*z*z; + ang_eval_1 = radial_eval*y*z*z*z; + ang_eval_2 = radial_eval*z*z*z*z; + basis_eval[ipt + 12*npts] = ang_eval_0; + basis_eval[ipt + 13*npts] = ang_eval_1; + basis_eval[ipt + 14*npts] = ang_eval_2; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; + + dang_eval_x_0 = x*x*x*(4*radial_eval + radial_eval_alpha*x*x); + dang_eval_y_0 = radial_eval_alpha*x*x*x*x*y; + 
dang_eval_z_0 = radial_eval_alpha*x*x*x*x*z; + dang_eval_x_1 = x*x*y*(3*radial_eval + radial_eval_alpha*x*x); + dang_eval_y_1 = x*x*x*(radial_eval + radial_eval_alpha*y*y); + dang_eval_z_1 = radial_eval_alpha*x*x*x*y*z; + dang_eval_x_2 = x*x*z*(3*radial_eval + radial_eval_alpha*x*x); + dang_eval_y_2 = radial_eval_alpha*x*x*x*y*z; + dang_eval_z_2 = x*x*x*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_3 = x*y*y*(2*radial_eval + radial_eval_alpha*x*x); + dang_eval_y_3 = x*x*y*(2*radial_eval + radial_eval_alpha*y*y); + dang_eval_z_3 = radial_eval_alpha*x*x*y*y*z; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + basis_x_eval[ipt + 3*npts] = dang_eval_x_3; + basis_y_eval[ipt + 3*npts] = dang_eval_y_3; + basis_z_eval[ipt + 3*npts] = dang_eval_z_3; + + dang_eval_x_0 = x*y*z*(2*radial_eval + radial_eval_alpha*x*x); + dang_eval_y_0 = x*x*z*(radial_eval + radial_eval_alpha*y*y); + dang_eval_z_0 = x*x*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_1 = x*z*z*(2*radial_eval + radial_eval_alpha*x*x); + dang_eval_y_1 = radial_eval_alpha*x*x*y*z*z; + dang_eval_z_1 = x*x*z*(2*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_2 = y*y*y*(radial_eval + radial_eval_alpha*x*x); + dang_eval_y_2 = x*y*y*(3*radial_eval + radial_eval_alpha*y*y); + dang_eval_z_2 = radial_eval_alpha*x*y*y*y*z; + dang_eval_x_3 = y*y*z*(radial_eval + radial_eval_alpha*x*x); + dang_eval_y_3 = x*y*z*(2*radial_eval + radial_eval_alpha*y*y); + dang_eval_z_3 = x*y*y*(radial_eval + radial_eval_alpha*z*z); + basis_x_eval[ipt + 4*npts] = dang_eval_x_0; + basis_y_eval[ipt + 4*npts] = dang_eval_y_0; + basis_z_eval[ipt + 4*npts] = dang_eval_z_0; + 
basis_x_eval[ipt + 5*npts] = dang_eval_x_1; + basis_y_eval[ipt + 5*npts] = dang_eval_y_1; + basis_z_eval[ipt + 5*npts] = dang_eval_z_1; + basis_x_eval[ipt + 6*npts] = dang_eval_x_2; + basis_y_eval[ipt + 6*npts] = dang_eval_y_2; + basis_z_eval[ipt + 6*npts] = dang_eval_z_2; + basis_x_eval[ipt + 7*npts] = dang_eval_x_3; + basis_y_eval[ipt + 7*npts] = dang_eval_y_3; + basis_z_eval[ipt + 7*npts] = dang_eval_z_3; + + dang_eval_x_0 = y*z*z*(radial_eval + radial_eval_alpha*x*x); + dang_eval_y_0 = x*z*z*(radial_eval + radial_eval_alpha*y*y); + dang_eval_z_0 = x*y*z*(2*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_1 = z*z*z*(radial_eval + radial_eval_alpha*x*x); + dang_eval_y_1 = radial_eval_alpha*x*y*z*z*z; + dang_eval_z_1 = x*z*z*(3*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_2 = radial_eval_alpha*x*y*y*y*y; + dang_eval_y_2 = y*y*y*(4*radial_eval + radial_eval_alpha*y*y); + dang_eval_z_2 = radial_eval_alpha*y*y*y*y*z; + dang_eval_x_3 = radial_eval_alpha*x*y*y*y*z; + dang_eval_y_3 = y*y*z*(3*radial_eval + radial_eval_alpha*y*y); + dang_eval_z_3 = y*y*y*(radial_eval + radial_eval_alpha*z*z); + basis_x_eval[ipt + 8*npts] = dang_eval_x_0; + basis_y_eval[ipt + 8*npts] = dang_eval_y_0; + basis_z_eval[ipt + 8*npts] = dang_eval_z_0; + basis_x_eval[ipt + 9*npts] = dang_eval_x_1; + basis_y_eval[ipt + 9*npts] = dang_eval_y_1; + basis_z_eval[ipt + 9*npts] = dang_eval_z_1; + basis_x_eval[ipt + 10*npts] = dang_eval_x_2; + basis_y_eval[ipt + 10*npts] = dang_eval_y_2; + basis_z_eval[ipt + 10*npts] = dang_eval_z_2; + basis_x_eval[ipt + 11*npts] = dang_eval_x_3; + basis_y_eval[ipt + 11*npts] = dang_eval_y_3; + basis_z_eval[ipt + 11*npts] = dang_eval_z_3; + + dang_eval_x_0 = radial_eval_alpha*x*y*y*z*z; + dang_eval_y_0 = y*z*z*(2*radial_eval + radial_eval_alpha*y*y); + dang_eval_z_0 = y*y*z*(2*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_1 = radial_eval_alpha*x*y*z*z*z; + dang_eval_y_1 = z*z*z*(radial_eval + radial_eval_alpha*y*y); + dang_eval_z_1 = y*z*z*(3*radial_eval 
+ radial_eval_alpha*z*z); + dang_eval_x_2 = radial_eval_alpha*x*z*z*z*z; + dang_eval_y_2 = radial_eval_alpha*y*z*z*z*z; + dang_eval_z_2 = z*z*z*(4*radial_eval + radial_eval_alpha*z*z); + basis_x_eval[ipt + 12*npts] = dang_eval_x_0; + basis_y_eval[ipt + 12*npts] = dang_eval_y_0; + basis_z_eval[ipt + 12*npts] = dang_eval_z_0; + basis_x_eval[ipt + 13*npts] = dang_eval_x_1; + basis_y_eval[ipt + 13*npts] = dang_eval_y_1; + basis_z_eval[ipt + 13*npts] = dang_eval_z_1; + basis_x_eval[ipt + 14*npts] = dang_eval_x_2; + basis_y_eval[ipt + 14*npts] = dang_eval_y_2; + basis_z_eval[ipt + 14*npts] = dang_eval_z_2; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_gradient.hpp index 50e904f0..320d1f2a 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_gradient.hpp @@ -113,14 +113,14 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x basis_x_eval[ipt + 0*npts] = sqrt_3*y*(radial_eval + radial_eval_alpha*x*x); basis_x_eval[ipt + 1*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_x_eval[ipt + 2*npts] = -x*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 2*z*z))/2; + basis_x_eval[ipt + 2*npts] = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; basis_x_eval[ipt + 3*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); basis_x_eval[ipt + 4*npts] = sqrt_3*x*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; // Evaluate first derivative of bfn wrt y 
basis_y_eval[ipt + 0*npts] = sqrt_3*x*(radial_eval + radial_eval_alpha*y*y); basis_y_eval[ipt + 1*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = -y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 2*z*z))/2; + basis_y_eval[ipt + 2*npts] = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; basis_y_eval[ipt + 3*npts] = sqrt_3*radial_eval_alpha*x*y*z; basis_y_eval[ipt + 4*npts] = sqrt_3*y*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; @@ -170,8 +170,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel dang_eval_x_1 = sqrt_3*radial_eval_alpha*x*y*z; dang_eval_y_1 = sqrt_3*z*(radial_eval + radial_eval_alpha*y*y); dang_eval_z_1 = sqrt_3*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = -x*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_y_2 = -y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 2*z*z))/2; + dang_eval_x_2 = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; + dang_eval_y_2 = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; dang_eval_z_2 = z*(4*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; dang_eval_x_3 = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); dang_eval_y_3 = sqrt_3*radial_eval_alpha*x*y*z; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_hessian.hpp index abf51d9b..b2224b32 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_hessian.hpp @@ -122,14 +122,14 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x 
basis_x_eval[ipt + 0*npts] = sqrt_3*y*(radial_eval + radial_eval_alpha*x*x); basis_x_eval[ipt + 1*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_x_eval[ipt + 2*npts] = -x*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 2*z*z))/2; + basis_x_eval[ipt + 2*npts] = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; basis_x_eval[ipt + 3*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); basis_x_eval[ipt + 4*npts] = sqrt_3*x*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; // Evaluate first derivative of bfn wrt y basis_y_eval[ipt + 0*npts] = sqrt_3*x*(radial_eval + radial_eval_alpha*y*y); basis_y_eval[ipt + 1*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = -y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 2*z*z))/2; + basis_y_eval[ipt + 2*npts] = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; basis_y_eval[ipt + 3*npts] = sqrt_3*radial_eval_alpha*x*y*z; basis_y_eval[ipt + 4*npts] = sqrt_3*y*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; @@ -150,7 +150,7 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate second derivative of bfn wrt xy basis_xy_eval[ipt + 0*npts] = sqrt_3*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); basis_xy_eval[ipt + 1*npts] = sqrt_3*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 2*npts] = -x*y*(4*radial_eval_alpha + radial_eval_alpha_squared*(x*x + y*y - 2*z*z))/2; + basis_xy_eval[ipt + 2*npts] = x*y*(-4*radial_eval_alpha - radial_eval_alpha_squared*(x*x + y*y - 2*z*z))/2; basis_xy_eval[ipt + 3*npts] = sqrt_3*y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); basis_xy_eval[ipt + 4*npts] = sqrt_3*radial_eval_alpha_squared*x*y*(x*x - y*y)/2; @@ -220,8 +220,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel dang_eval_x_1 = sqrt_3*radial_eval_alpha*x*y*z; dang_eval_y_1 = sqrt_3*z*(radial_eval + 
radial_eval_alpha*y*y); dang_eval_z_1 = sqrt_3*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = -x*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_y_2 = -y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 2*z*z))/2; + dang_eval_x_2 = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; + dang_eval_y_2 = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; dang_eval_z_2 = z*(4*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; dang_eval_x_3 = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); dang_eval_y_3 = sqrt_3*radial_eval_alpha*x*y*z; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_gradient.hpp index c6331457..dbe7b066 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_gradient.hpp @@ -115,18 +115,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x basis_x_eval[ipt + 0*npts] = sqrt_10*x*y*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; basis_x_eval[ipt + 1*npts] = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = -sqrt_6*x*y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_x_eval[ipt + 3*npts] = -x*z*(6*radial_eval + radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_x_eval[ipt + 4*npts] = -sqrt_6*(radial_eval*(3*x*x + y*y - 4*z*z) + radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; + basis_x_eval[ipt + 2*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; + basis_x_eval[ipt + 3*npts] = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 
2*z*z))/2; + basis_x_eval[ipt + 4*npts] = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; basis_x_eval[ipt + 5*npts] = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; basis_x_eval[ipt + 6*npts] = sqrt_10*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; // Evaluate first derivative of bfn wrt y basis_y_eval[ipt + 0*npts] = sqrt_10*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; basis_y_eval[ipt + 1*npts] = sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = -sqrt_6*(radial_eval*(x*x + 3*y*y - 4*z*z) + radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; - basis_y_eval[ipt + 3*npts] = -y*z*(6*radial_eval + radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_y_eval[ipt + 4*npts] = -sqrt_6*x*y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 4*z*z))/4; + basis_y_eval[ipt + 2*npts] = sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; + basis_y_eval[ipt + 3*npts] = y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; + basis_y_eval[ipt + 4*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; basis_y_eval[ipt + 5*npts] = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; basis_y_eval[ipt + 6*npts] = sqrt_10*x*y*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; @@ -182,11 +182,11 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel dang_eval_x_1 = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); dang_eval_y_1 = sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); dang_eval_z_1 = sqrt_15*x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = -sqrt_6*x*y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_y_2 = -sqrt_6*(radial_eval*(x*x + 3*y*y - 4*z*z) + radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; + dang_eval_x_2 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; + 
dang_eval_y_2 = sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; dang_eval_z_2 = sqrt_6*y*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_x_3 = -x*z*(6*radial_eval + radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - dang_eval_y_3 = -y*z*(6*radial_eval + radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; + dang_eval_x_3 = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; + dang_eval_y_3 = y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; dang_eval_z_3 = -3*radial_eval*(x*x + y*y - 2*z*z)/2 - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z)/2; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; @@ -201,8 +201,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = -sqrt_6*(radial_eval*(3*x*x + y*y - 4*z*z) + radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; - dang_eval_y_0 = -sqrt_6*x*y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 4*z*z))/4; + dang_eval_x_0 = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; + dang_eval_y_0 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; dang_eval_z_0 = sqrt_6*x*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; dang_eval_x_1 = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; dang_eval_y_1 = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp index 8f7c337c..1d7165a8 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp +++ 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp @@ -124,18 +124,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x basis_x_eval[ipt + 0*npts] = sqrt_10*x*y*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; basis_x_eval[ipt + 1*npts] = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = -sqrt_6*x*y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_x_eval[ipt + 3*npts] = -x*z*(6*radial_eval + radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_x_eval[ipt + 4*npts] = -sqrt_6*(radial_eval*(3*x*x + y*y - 4*z*z) + radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; + basis_x_eval[ipt + 2*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; + basis_x_eval[ipt + 3*npts] = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; + basis_x_eval[ipt + 4*npts] = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; basis_x_eval[ipt + 5*npts] = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; basis_x_eval[ipt + 6*npts] = sqrt_10*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; // Evaluate first derivative of bfn wrt y basis_y_eval[ipt + 0*npts] = sqrt_10*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; basis_y_eval[ipt + 1*npts] = sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = -sqrt_6*(radial_eval*(x*x + 3*y*y - 4*z*z) + radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; - basis_y_eval[ipt + 3*npts] = -y*z*(6*radial_eval + radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_y_eval[ipt + 4*npts] = -sqrt_6*x*y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 4*z*z))/4; + basis_y_eval[ipt + 2*npts] = sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; + basis_y_eval[ipt + 3*npts] = 
y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; + basis_y_eval[ipt + 4*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; basis_y_eval[ipt + 5*npts] = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; basis_y_eval[ipt + 6*npts] = sqrt_10*x*y*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; @@ -151,18 +151,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate second derivative of bfn wrt xx basis_xx_eval[ipt + 0*npts] = sqrt_10*y*(6*radial_eval + 12*radial_eval_alpha*x*x + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x - y*y))/4; basis_xx_eval[ipt + 1*npts] = sqrt_15*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 2*npts] = -sqrt_6*y*(2*radial_eval + 4*radial_eval_alpha*x*x + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x + y*y - 4*z*z))/4; - basis_xx_eval[ipt + 3*npts] = -z*(6*radial_eval + 12*radial_eval_alpha*x*x + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x + 3*y*y - 2*z*z))/2; - basis_xx_eval[ipt + 4*npts] = -sqrt_6*x*(6*radial_eval + 2*radial_eval_alpha*(3*x*x + y*y - 4*z*z) + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x + y*y - 4*z*z))/4; + basis_xx_eval[ipt + 2*npts] = sqrt_6*y*(-2*radial_eval - 4*radial_eval_alpha*x*x - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x + y*y - 4*z*z))/4; + basis_xx_eval[ipt + 3*npts] = z*(-6*radial_eval - 12*radial_eval_alpha*x*x - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x + 3*y*y - 2*z*z))/2; + basis_xx_eval[ipt + 4*npts] = sqrt_6*x*(-6*radial_eval - 2*radial_eval_alpha*(3*x*x + y*y - 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x + y*y - 4*z*z))/4; basis_xx_eval[ipt + 5*npts] = sqrt_15*z*(2*radial_eval + 4*radial_eval_alpha*x*x + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x - y*y))/2; basis_xx_eval[ipt + 6*npts] = sqrt_10*x*(6*radial_eval + 6*radial_eval_alpha*(x*x - y*y) + 
(radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x - 3*y*y))/4; // Evaluate second derivative of bfn wrt xy basis_xy_eval[ipt + 0*npts] = sqrt_10*x*(6*radial_eval + 3*radial_eval_alpha*x*x + 3*radial_eval_alpha*y*y + 3*radial_eval_alpha_squared*x*x*y*y - radial_eval_alpha_squared*y*y*y*y)/4; basis_xy_eval[ipt + 1*npts] = sqrt_15*z*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 2*npts] = -sqrt_6*x*(2*radial_eval + 2*radial_eval_alpha*y*y + radial_eval_alpha*(x*x + 3*y*y - 4*z*z) + radial_eval_alpha_squared*y*y*(x*x + y*y - 4*z*z))/4; - basis_xy_eval[ipt + 3*npts] = -x*y*z*(12*radial_eval_alpha + radial_eval_alpha_squared*(3*x*x + 3*y*y - 2*z*z))/2; - basis_xy_eval[ipt + 4*npts] = -sqrt_6*y*(2*radial_eval + 2*radial_eval_alpha*x*x + radial_eval_alpha*(3*x*x + y*y - 4*z*z) + radial_eval_alpha_squared*x*x*(x*x + y*y - 4*z*z))/4; + basis_xy_eval[ipt + 2*npts] = sqrt_6*x*(-2*radial_eval - 2*radial_eval_alpha*y*y - radial_eval_alpha*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha_squared*y*y*(x*x + y*y - 4*z*z))/4; + basis_xy_eval[ipt + 3*npts] = x*y*z*(-12*radial_eval_alpha - radial_eval_alpha_squared*(3*x*x + 3*y*y - 2*z*z))/2; + basis_xy_eval[ipt + 4*npts] = sqrt_6*y*(-2*radial_eval - 2*radial_eval_alpha*x*x - radial_eval_alpha*(3*x*x + y*y - 4*z*z) - radial_eval_alpha_squared*x*x*(x*x + y*y - 4*z*z))/4; basis_xy_eval[ipt + 5*npts] = sqrt_15*radial_eval_alpha_squared*x*y*z*(x*x - y*y)/2; basis_xy_eval[ipt + 6*npts] = sqrt_10*y*(-6*radial_eval - 3*radial_eval_alpha*x*x - 3*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x - 3*radial_eval_alpha_squared*x*x*y*y)/4; @@ -178,9 +178,9 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate second derivative of bfn wrt yy basis_yy_eval[ipt + 0*npts] = sqrt_10*y*(-6*radial_eval - 6*radial_eval_alpha*(-x*x + y*y) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x - y*y))/4; basis_yy_eval[ipt + 
1*npts] = sqrt_15*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 2*npts] = -sqrt_6*y*(6*radial_eval + 2*radial_eval_alpha*(x*x + 3*y*y - 4*z*z) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x + y*y - 4*z*z))/4; - basis_yy_eval[ipt + 3*npts] = -z*(6*radial_eval + 12*radial_eval_alpha*y*y + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x + 3*y*y - 2*z*z))/2; - basis_yy_eval[ipt + 4*npts] = -sqrt_6*x*(2*radial_eval + 4*radial_eval_alpha*y*y + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x + y*y - 4*z*z))/4; + basis_yy_eval[ipt + 2*npts] = sqrt_6*y*(-6*radial_eval - 2*radial_eval_alpha*(x*x + 3*y*y - 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x + y*y - 4*z*z))/4; + basis_yy_eval[ipt + 3*npts] = z*(-6*radial_eval - 12*radial_eval_alpha*y*y - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x + 3*y*y - 2*z*z))/2; + basis_yy_eval[ipt + 4*npts] = sqrt_6*x*(-2*radial_eval - 4*radial_eval_alpha*y*y - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x + y*y - 4*z*z))/4; basis_yy_eval[ipt + 5*npts] = sqrt_15*z*(-2*radial_eval - 4*radial_eval_alpha*y*y + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x - y*y))/2; basis_yy_eval[ipt + 6*npts] = sqrt_10*x*(-6*radial_eval - 12*radial_eval_alpha*y*y + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x - 3*y*y))/4; @@ -244,11 +244,11 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel dang_eval_x_1 = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); dang_eval_y_1 = sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); dang_eval_z_1 = sqrt_15*x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = -sqrt_6*x*y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_y_2 = -sqrt_6*(radial_eval*(x*x + 3*y*y - 4*z*z) + radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; + dang_eval_x_2 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; + dang_eval_y_2 = 
sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; dang_eval_z_2 = sqrt_6*y*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_x_3 = -x*z*(6*radial_eval + radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - dang_eval_y_3 = -y*z*(6*radial_eval + radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; + dang_eval_x_3 = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; + dang_eval_y_3 = y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; dang_eval_z_3 = -3*radial_eval*(x*x + y*y - 2*z*z)/2 - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z)/2; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; @@ -263,8 +263,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = -sqrt_6*(radial_eval*(3*x*x + y*y - 4*z*z) + radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; - dang_eval_y_0 = -sqrt_6*x*y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 4*z*z))/4; + dang_eval_x_0 = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; + dang_eval_y_0 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; dang_eval_z_0 = sqrt_6*x*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; dang_eval_x_1 = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; dang_eval_y_1 = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4.hpp new file mode 100644 index 00000000..075d943e --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4.hpp @@ -0,0 +1,156 @@ 
+/**
+ * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * through Lawrence Berkeley National Laboratory (subject to receipt of
+ * any required approvals from the U.S. Dept. of Energy). All rights reserved.
+ *
+ * See LICENSE.txt for details
+ */
+#pragma once
+#include "collocation_device_constants.hpp"
+#include "device/xc_device_task.hpp"
+#include "device_specific/cuda_device_constants.hpp"
+#include "device/common/shell_to_task.hpp"
+// NOTE(review): the header name on the final #include was missing in this copy
+// of the patch. <cmath> is what the kernel body needs (std::exp); confirm
+// against the generator template before merging.
+#include <cmath>
+
+namespace GauXC {
+
+/**
+ * Collocation kernel for spherical shells with l = 4.
+ *
+ * For every shell in shell_to_task, and for every task that references it,
+ * writes the values of the shell's nine basis functions at each grid point of
+ * the task into the task's `bf` buffer.
+ *
+ * Work distribution, as coded below:
+ *   - shells : ish starts at blockIdx.z and strides by gridDim.z
+ *   - tasks  : itask starts at the global warp index and strides by the
+ *              global warp count
+ *   - points : ipt starts at the lane index and strides by cuda::warp_size
+ *
+ * Output layout: function k (k = 0..8) at point ipt is stored at
+ *   task->bf[ task_shell_offs[itask]*npts + ipt + k*npts ].
+ *
+ * Shared memory holds 16 staging rows selected by threadIdx.x/32, i.e. one row
+ * per group of 32 threads under the 512-thread launch bound.
+ *
+ * @param nshell        number of entries of shell_to_task to process
+ * @param shell_to_task per-shell record: device shell pointer, task count,
+ *                      task indices and per-task shell offsets
+ * @param device_tasks  task array indexed through task_idx; supplies the point
+ *                      coordinates, npts and the bf output buffer
+ */
+__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_4(
+  uint32_t                        nshell,
+  ShellToTaskDevice* __restrict__ shell_to_task,
+  XCDeviceTask*      __restrict__ device_tasks
+) {
+
+  // One row of exponents / coefficients per group of 32 threads.
+  __shared__ double alpha[16][detail::shell_nprim_max + 1];
+  __shared__ double coeff[16][detail::shell_nprim_max + 1];
+  double* my_alpha = alpha[threadIdx.x/32];
+  double* my_coeff = coeff[threadIdx.x/32];
+
+  // Lane and warp indices do not depend on the shell; compute them once.
+  const int lane           = threadIdx.x % cuda::warp_size;
+  const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size;
+  const int nwarp_global   = max((blockDim.x*gridDim.x) / cuda::warp_size,1);
+
+  for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) {
+    const uint32_t ntasks      = shell_to_task[ish].ntask;
+    const auto shell           = shell_to_task[ish].shell_device;
+    const auto task_idx        = shell_to_task[ish].task_idx_device;
+    const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device;
+
+    // Shell data: primitive count and centre. The cast target was missing in
+    // this copy of the patch; const double3* matches the declared type of O.
+    const uint32_t nprim = shell->nprim();
+    const double3 O = *reinterpret_cast<const double3*>(shell->O_data());
+
+    // Stage exponents / coefficients in this warp's shared-memory row, one
+    // entry per lane (shell_nprim_max == warp_size). The first barrier keeps
+    // lanes that already finished the previous shell from overwriting the row
+    // while other lanes are still reading it; the second makes the new values
+    // visible to every lane before the primitive loop reads them.
+    {
+      auto* coeff_gm = shell->coeff_data();
+      auto* alpha_gm = shell->alpha_data();
+      static_assert( detail::shell_nprim_max == cuda::warp_size );
+      __syncwarp();
+      my_alpha[lane] = alpha_gm[lane];
+      my_coeff[lane] = coeff_gm[lane];
+      __syncwarp();
+    }
+
+    // Each warp takes every nwarp_global-th task that references this shell.
+    for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) {
+
+      const auto* task = device_tasks + task_idx[itask];
+      const auto* __restrict__ points_x = task->points_x;
+      const auto* __restrict__ points_y = task->points_y;
+      const auto* __restrict__ points_z = task->points_z;
+      const uint32_t npts = task->npts;
+      const size_t shoff = task_shell_offs[itask] * npts;
+
+      auto* __restrict__ basis_eval = task->bf + shoff;
+
+      // One grid point per lane, striding by the warp size.
+      #pragma unroll 1
+      for( int ipt = lane; ipt < npts; ipt += cuda::warp_size ) {
+
+        // Point coordinates relative to the shell centre.
+        const auto x = points_x[ipt] - O.x;
+        const auto y = points_y[ipt] - O.y;
+        const auto z = points_z[ipt] - O.z;
+        const auto rsq = x*x + y*y + z*z;
+
+        // Radial part: sum_i coeff[i] * exp(-alpha[i] * r^2).
+        double radial_eval = 0.;
+
+        #pragma unroll 1
+        for( uint32_t i = 0; i < nprim; ++i ) {
+          const auto a = my_alpha[i];
+          const auto e = my_coeff[i] * std::exp( - a * rsq );
+
+          radial_eval += e;
+        }
+
+        // Radial part times the nine l = 4 angular polynomials.
+        basis_eval[ipt + 0*npts] = sqrt_35*radial_eval*x*y*(x*x - y*y)/2;
+        basis_eval[ipt + 1*npts] = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4;
+        basis_eval[ipt + 2*npts] = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2;
+        basis_eval[ipt + 3*npts] = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4;
+        basis_eval[ipt + 4*npts] = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8;
+        basis_eval[ipt + 5*npts] = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4;
+        basis_eval[ipt + 6*npts] = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4;
+        basis_eval[ipt + 7*npts] = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4;
+        basis_eval[ipt + 8*npts] = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8;
+
+      } // Loop over points within task
+    } // Loop over tasks
+
+  } // Loop over shells
+} // end kernel
+
+} // namespace GauXC
diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_gradient.hpp new file mode 100644 index 00000000..0dc3f241 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_gradient.hpp @@ -0,0 +1,256 @@ +/** + * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). All rights reserved.
+ * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_gradient_4( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[16][detail::shell_nprim_max + 1]; + __shared__ double coeff[16][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = 
task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + } + + radial_eval_alpha *= -2; + + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; + basis_eval[ipt + 1*npts] = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; + basis_eval[ipt + 2*npts] = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; + basis_eval[ipt + 3*npts] = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + basis_eval[ipt + 4*npts] = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + basis_eval[ipt + 5*npts] = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; + basis_eval[ipt + 6*npts] = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; + basis_eval[ipt + 7*npts] = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; + basis_eval[ipt + 8*npts] = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + + + + // Evaluate first derivative of 
bfn wrt x + basis_x_eval[ipt + 0*npts] = sqrt_35*y*(radial_eval*(3*x*x - y*y) + radial_eval_alpha*x*x*(x*x - y*y))/2; + basis_x_eval[ipt + 1*npts] = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; + basis_x_eval[ipt + 2*npts] = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; + basis_x_eval[ipt + 3*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; + basis_x_eval[ipt + 4*npts] = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + basis_x_eval[ipt + 5*npts] = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; + basis_x_eval[ipt + 6*npts] = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_x_eval[ipt + 7*npts] = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; + basis_x_eval[ipt + 8*npts] = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; + basis_y_eval[ipt + 1*npts] = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; + basis_y_eval[ipt + 2*npts] = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; + basis_y_eval[ipt + 3*npts] = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; + basis_y_eval[ipt + 4*npts] = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + basis_y_eval[ipt + 5*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; + basis_y_eval[ipt + 6*npts] = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - 
radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_y_eval[ipt + 7*npts] = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; + basis_y_eval[ipt + 8*npts] = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; + basis_z_eval[ipt + 1*npts] = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; + basis_z_eval[ipt + 2*npts] = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; + basis_z_eval[ipt + 3*npts] = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_z_eval[ipt + 4*npts] = z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + basis_z_eval[ipt + 5*npts] = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_z_eval[ipt + 6*npts] = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_z_eval[ipt + 7*npts] = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; + basis_z_eval[ipt + 8*npts] = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; + ang_eval_1 = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; + ang_eval_2 = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; + ang_eval_3 = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 
3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + ang_eval_1 = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; + ang_eval_2 = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; + ang_eval_3 = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; + basis_eval[ipt + 4*npts] = ang_eval_0; + basis_eval[ipt + 5*npts] = ang_eval_1; + basis_eval[ipt + 6*npts] = ang_eval_2; + basis_eval[ipt + 7*npts] = ang_eval_3; + + ang_eval_0 = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_eval[ipt + 8*npts] = ang_eval_0; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; + + dang_eval_x_0 = sqrt_35*y*(radial_eval*(3*x*x - y*y) + radial_eval_alpha*x*x*(x*x - y*y))/2; + dang_eval_y_0 = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; + dang_eval_z_0 = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; + dang_eval_x_1 = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; + dang_eval_y_1 = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; + dang_eval_z_1 = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; + dang_eval_x_2 = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; + dang_eval_y_2 = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; + dang_eval_z_2 = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; + dang_eval_x_3 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_y_3 = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_z_3 = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = 
dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + basis_x_eval[ipt + 3*npts] = dang_eval_x_3; + basis_y_eval[ipt + 3*npts] = dang_eval_y_3; + basis_z_eval[ipt + 3*npts] = dang_eval_z_3; + + dang_eval_x_0 = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + dang_eval_y_0 = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + dang_eval_z_0 = z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + dang_eval_x_1 = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_y_1 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_z_1 = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_x_2 = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + dang_eval_y_2 = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + dang_eval_z_2 = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + dang_eval_x_3 = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; + dang_eval_y_3 = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; + dang_eval_z_3 = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; + basis_x_eval[ipt + 4*npts] = dang_eval_x_0; + basis_y_eval[ipt + 4*npts] = 
dang_eval_y_0; + basis_z_eval[ipt + 4*npts] = dang_eval_z_0; + basis_x_eval[ipt + 5*npts] = dang_eval_x_1; + basis_y_eval[ipt + 5*npts] = dang_eval_y_1; + basis_z_eval[ipt + 5*npts] = dang_eval_z_1; + basis_x_eval[ipt + 6*npts] = dang_eval_x_2; + basis_y_eval[ipt + 6*npts] = dang_eval_y_2; + basis_z_eval[ipt + 6*npts] = dang_eval_z_2; + basis_x_eval[ipt + 7*npts] = dang_eval_x_3; + basis_y_eval[ipt + 7*npts] = dang_eval_y_3; + basis_z_eval[ipt + 7*npts] = dang_eval_z_3; + + dang_eval_x_0 = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + dang_eval_y_0 = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + dang_eval_z_0 = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_x_eval[ipt + 8*npts] = dang_eval_x_0; + basis_y_eval[ipt + 8*npts] = dang_eval_y_0; + basis_z_eval[ipt + 8*npts] = dang_eval_z_0; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_hessian.hpp new file mode 100644 index 00000000..01b2f7cc --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_hessian.hpp @@ -0,0 +1,330 @@ +/** + * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_hessian_4( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[16][detail::shell_nprim_max + 1]; + __shared__ double coeff[16][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = 
task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + + auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; + basis_eval[ipt + 1*npts] = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; + basis_eval[ipt + 2*npts] = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; + basis_eval[ipt + 3*npts] = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + basis_eval[ipt + 
4*npts] = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + basis_eval[ipt + 5*npts] = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; + basis_eval[ipt + 6*npts] = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; + basis_eval[ipt + 7*npts] = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; + basis_eval[ipt + 8*npts] = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = sqrt_35*y*(radial_eval*(3*x*x - y*y) + radial_eval_alpha*x*x*(x*x - y*y))/2; + basis_x_eval[ipt + 1*npts] = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; + basis_x_eval[ipt + 2*npts] = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; + basis_x_eval[ipt + 3*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; + basis_x_eval[ipt + 4*npts] = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + basis_x_eval[ipt + 5*npts] = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; + basis_x_eval[ipt + 6*npts] = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_x_eval[ipt + 7*npts] = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; + basis_x_eval[ipt + 8*npts] = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; + basis_y_eval[ipt + 1*npts] = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; + basis_y_eval[ipt + 2*npts] = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; + 
basis_y_eval[ipt + 3*npts] = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; + basis_y_eval[ipt + 4*npts] = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + basis_y_eval[ipt + 5*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; + basis_y_eval[ipt + 6*npts] = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_y_eval[ipt + 7*npts] = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; + basis_y_eval[ipt + 8*npts] = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; + basis_z_eval[ipt + 1*npts] = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; + basis_z_eval[ipt + 2*npts] = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; + basis_z_eval[ipt + 3*npts] = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_z_eval[ipt + 4*npts] = z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + basis_z_eval[ipt + 5*npts] = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_z_eval[ipt + 6*npts] = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_z_eval[ipt + 7*npts] = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; + basis_z_eval[ipt + 8*npts] = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = sqrt_35*x*y*(6*radial_eval + 2*radial_eval_alpha*(3*x*x - y*y) + 
(radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x - y*y))/2; + basis_xx_eval[ipt + 1*npts] = sqrt_70*y*z*(6*radial_eval + 12*radial_eval_alpha*x*x + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x - y*y))/4; + basis_xx_eval[ipt + 2*npts] = sqrt_5*x*y*(-6*radial_eval - 2*radial_eval_alpha*(3*x*x + y*y - 6*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x + y*y - 6*z*z))/2; + basis_xx_eval[ipt + 3*npts] = sqrt_10*y*z*(-6*radial_eval - 12*radial_eval_alpha*x*x - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x + 3*y*y - 4*z*z))/4; + basis_xx_eval[ipt + 4*npts] = 3*radial_eval*(3*x*x + y*y - 4*z*z)/2 + 3*radial_eval_alpha*x*x*(x*x + y*y - 4*z*z) + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + basis_xx_eval[ipt + 5*npts] = sqrt_10*x*z*(-18*radial_eval - 2*radial_eval_alpha*(9*x*x + 3*y*y - 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x + 3*y*y - 4*z*z))/4; + basis_xx_eval[ipt + 6*npts] = sqrt_5*(-12*radial_eval*(x*x - z*z) - 8*radial_eval_alpha*x*x*(x*x - 3*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_xx_eval[ipt + 7*npts] = sqrt_70*x*z*(6*radial_eval + 6*radial_eval_alpha*(x*x - y*y) + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x - 3*y*y))/4; + basis_xx_eval[ipt + 8*npts] = sqrt_35*(12*radial_eval*(x*x - y*y) + 8*radial_eval_alpha*x*x*(x*x - 3*y*y) + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = sqrt_35*(3*radial_eval*x*x - 3*radial_eval*y*y + radial_eval_alpha*x*x*x*x - radial_eval_alpha*y*y*y*y + radial_eval_alpha_squared*x*x*x*x*y*y - radial_eval_alpha_squared*x*x*y*y*y*y)/2; + basis_xy_eval[ipt + 1*npts] = sqrt_70*x*z*(6*radial_eval + 3*radial_eval_alpha*x*x + 3*radial_eval_alpha*y*y + 
3*radial_eval_alpha_squared*x*x*y*y - radial_eval_alpha_squared*y*y*y*y)/4; + basis_xy_eval[ipt + 2*npts] = sqrt_5*(-3*radial_eval*(x*x + y*y - 2*z*z) - radial_eval_alpha*x*x*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(3*x*x + y*y - 6*z*z) - radial_eval_alpha_squared*x*x*y*y*(x*x + y*y - 6*z*z))/2; + basis_xy_eval[ipt + 3*npts] = sqrt_10*x*z*(-6*radial_eval - 6*radial_eval_alpha*y*y - radial_eval_alpha*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha_squared*y*y*(3*x*x + 3*y*y - 4*z*z))/4; + basis_xy_eval[ipt + 4*npts] = x*y*(24*radial_eval + 24*radial_eval_alpha*(x*x + y*y - 4*z*z) + radial_eval_alpha_squared*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + basis_xy_eval[ipt + 5*npts] = sqrt_10*y*z*(-6*radial_eval - 6*radial_eval_alpha*x*x - radial_eval_alpha*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha_squared*x*x*(3*x*x + 3*y*y - 4*z*z))/4; + basis_xy_eval[ipt + 6*npts] = sqrt_5*x*y*(-4*radial_eval_alpha*x*x + 4*radial_eval_alpha*y*y - radial_eval_alpha_squared*x*x*x*x + 6*radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*y*y - 6*radial_eval_alpha_squared*y*y*z*z)/4; + basis_xy_eval[ipt + 7*npts] = sqrt_70*y*z*(-6*radial_eval - 3*radial_eval_alpha*x*x - 3*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x - 3*radial_eval_alpha_squared*x*x*y*y)/4; + basis_xy_eval[ipt + 8*npts] = sqrt_35*x*y*(-24*radial_eval - 8*radial_eval_alpha*x*x - 8*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x - 6*radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y)/8; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = sqrt_35*y*z*(radial_eval_alpha*(3*x*x - y*y) + radial_eval_alpha_squared*x*x*(x*x - y*y))/2; + basis_xz_eval[ipt + 1*npts] = sqrt_70*x*y*(6*radial_eval + 6*radial_eval_alpha*z*z + radial_eval_alpha*(3*x*x - y*y) + radial_eval_alpha_squared*z*z*(3*x*x - y*y))/4; + basis_xz_eval[ipt + 2*npts] = sqrt_5*y*z*(12*radial_eval + 12*radial_eval_alpha*x*x - 
radial_eval_alpha*(3*x*x + y*y - 6*z*z) - radial_eval_alpha_squared*x*x*(x*x + y*y - 6*z*z))/2; + basis_xz_eval[ipt + 3*npts] = sqrt_10*x*y*(-6*radial_eval - 6*radial_eval_alpha*z*z + 3*radial_eval_alpha*(-x*x - y*y + 4*z*z) - radial_eval_alpha_squared*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_xz_eval[ipt + 4*npts] = x*z*(-96*radial_eval - 36*radial_eval_alpha*x*x - 36*radial_eval_alpha*y*y - 16*radial_eval_alpha*z*z + 3*radial_eval_alpha_squared*x*x*x*x + 6*radial_eval_alpha_squared*x*x*y*y - 24*radial_eval_alpha_squared*x*x*z*z + 3*radial_eval_alpha_squared*y*y*y*y - 24*radial_eval_alpha_squared*y*y*z*z + 8*radial_eval_alpha_squared*z*z*z*z)/8; + basis_xz_eval[ipt + 5*npts] = sqrt_10*(-3*radial_eval*(3*x*x + y*y - 4*z*z) + 3*radial_eval_alpha*x*x*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha_squared*x*x*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_xz_eval[ipt + 6*npts] = sqrt_5*x*z*(24*radial_eval + 12*radial_eval_alpha*(x*x - y*y) - 4*radial_eval_alpha*(x*x - 3*z*z) - radial_eval_alpha_squared*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_xz_eval[ipt + 7*npts] = sqrt_70*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y) + 3*radial_eval_alpha*z*z*(x*x - y*y) + radial_eval_alpha_squared*x*x*z*z*(x*x - 3*y*y))/4; + basis_xz_eval[ipt + 8*npts] = sqrt_35*x*z*(4*radial_eval_alpha*(x*x - 3*y*y) + radial_eval_alpha_squared*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = sqrt_35*x*y*(-6*radial_eval - 2*radial_eval_alpha*(-x*x + 3*y*y) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x - y*y))/2; + basis_yy_eval[ipt + 1*npts] = sqrt_70*y*z*(-6*radial_eval - 6*radial_eval_alpha*(-x*x + y*y) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x - y*y))/4; + basis_yy_eval[ipt + 2*npts] = sqrt_5*x*y*(-6*radial_eval - 2*radial_eval_alpha*(x*x + 3*y*y - 6*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x + y*y - 6*z*z))/2; 
+ basis_yy_eval[ipt + 3*npts] = sqrt_10*y*z*(-18*radial_eval - 2*radial_eval_alpha*(3*x*x + 9*y*y - 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x + 3*y*y - 4*z*z))/4; + basis_yy_eval[ipt + 4*npts] = 3*radial_eval*(x*x + 3*y*y - 4*z*z)/2 + 3*radial_eval_alpha*y*y*(x*x + y*y - 4*z*z) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + basis_yy_eval[ipt + 5*npts] = sqrt_10*x*z*(-6*radial_eval - 12*radial_eval_alpha*y*y - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x + 3*y*y - 4*z*z))/4; + basis_yy_eval[ipt + 6*npts] = sqrt_5*(12*radial_eval*(y*y - z*z) + 8*radial_eval_alpha*y*y*(y*y - 3*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_yy_eval[ipt + 7*npts] = sqrt_70*x*z*(-6*radial_eval - 12*radial_eval_alpha*y*y + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x - 3*y*y))/4; + basis_yy_eval[ipt + 8*npts] = sqrt_35*(-12*radial_eval*(x*x - y*y) - 8*radial_eval_alpha*y*y*(3*x*x - y*y) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = sqrt_35*x*z*(-radial_eval_alpha*(-x*x + 3*y*y) + radial_eval_alpha_squared*y*y*(x*x - y*y))/2; + basis_yz_eval[ipt + 1*npts] = sqrt_70*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y) - 3*radial_eval_alpha*z*z*(-x*x + y*y) + radial_eval_alpha_squared*y*y*z*z*(3*x*x - y*y))/4; + basis_yz_eval[ipt + 2*npts] = sqrt_5*x*z*(12*radial_eval + 12*radial_eval_alpha*y*y - radial_eval_alpha*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha_squared*y*y*(x*x + y*y - 6*z*z))/2; + basis_yz_eval[ipt + 3*npts] = sqrt_10*(-3*radial_eval*(x*x + 3*y*y - 4*z*z) + 3*radial_eval_alpha*y*y*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha_squared*y*y*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_yz_eval[ipt + 4*npts] = 
y*z*(-96*radial_eval - 36*radial_eval_alpha*x*x - 36*radial_eval_alpha*y*y - 16*radial_eval_alpha*z*z + 3*radial_eval_alpha_squared*x*x*x*x + 6*radial_eval_alpha_squared*x*x*y*y - 24*radial_eval_alpha_squared*x*x*z*z + 3*radial_eval_alpha_squared*y*y*y*y - 24*radial_eval_alpha_squared*y*y*z*z + 8*radial_eval_alpha_squared*z*z*z*z)/8; + basis_yz_eval[ipt + 5*npts] = sqrt_10*x*y*(-6*radial_eval - 6*radial_eval_alpha*z*z + 3*radial_eval_alpha*(-x*x - y*y + 4*z*z) - radial_eval_alpha_squared*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_yz_eval[ipt + 6*npts] = sqrt_5*y*z*(-24*radial_eval + 12*radial_eval_alpha*(x*x - y*y) + 4*radial_eval_alpha*(y*y - 3*z*z) - radial_eval_alpha_squared*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_yz_eval[ipt + 7*npts] = sqrt_70*x*y*(-6*radial_eval - 6*radial_eval_alpha*z*z + radial_eval_alpha*(x*x - 3*y*y) + radial_eval_alpha_squared*z*z*(x*x - 3*y*y))/4; + basis_yz_eval[ipt + 8*npts] = sqrt_35*y*z*(-4*radial_eval_alpha*(3*x*x - y*y) + radial_eval_alpha_squared*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = sqrt_35*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x - y*y)/2; + basis_zz_eval[ipt + 1*npts] = sqrt_70*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x - y*y)/4; + basis_zz_eval[ipt + 2*npts] = sqrt_5*x*y*(12*radial_eval + 24*radial_eval_alpha*z*z - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x + y*y - 6*z*z))/2; + basis_zz_eval[ipt + 3*npts] = sqrt_10*y*z*(24*radial_eval + 6*radial_eval_alpha*(-x*x - y*y + 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x + 3*y*y - 4*z*z))/4; + basis_zz_eval[ipt + 4*npts] = -6*radial_eval*(x*x + y*y - 2*z*z) - 4*radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z) + (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + basis_zz_eval[ipt + 5*npts] = sqrt_10*x*z*(24*radial_eval + 
6*radial_eval_alpha*(-x*x - y*y + 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x + 3*y*y - 4*z*z))/4; + basis_zz_eval[ipt + 6*npts] = sqrt_5*(12*radial_eval*(x*x - y*y) + 24*radial_eval_alpha*z*z*(x*x - y*y) - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_zz_eval[ipt + 7*npts] = sqrt_70*x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x - 3*y*y)/4; + basis_zz_eval[ipt + 8*npts] = sqrt_35*(radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; + ang_eval_1 = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; + ang_eval_2 = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; + ang_eval_3 = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + ang_eval_1 = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; + ang_eval_2 = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; + ang_eval_3 = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; + basis_eval[ipt + 4*npts] = ang_eval_0; + basis_eval[ipt + 5*npts] = ang_eval_1; + basis_eval[ipt + 6*npts] = ang_eval_2; + basis_eval[ipt + 7*npts] = ang_eval_3; + + ang_eval_0 = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_eval[ipt + 8*npts] = ang_eval_0; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; + + dang_eval_x_0 = sqrt_35*y*(radial_eval*(3*x*x - y*y) + 
radial_eval_alpha*x*x*(x*x - y*y))/2; + dang_eval_y_0 = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; + dang_eval_z_0 = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; + dang_eval_x_1 = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; + dang_eval_y_1 = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; + dang_eval_z_1 = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; + dang_eval_x_2 = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; + dang_eval_y_2 = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; + dang_eval_z_2 = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; + dang_eval_x_3 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_y_3 = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_z_3 = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + basis_x_eval[ipt + 3*npts] = dang_eval_x_3; + basis_y_eval[ipt + 3*npts] = dang_eval_y_3; + basis_z_eval[ipt + 3*npts] = dang_eval_z_3; + + dang_eval_x_0 = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + dang_eval_y_0 = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + dang_eval_z_0 = 
z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + dang_eval_x_1 = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_y_1 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_z_1 = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_x_2 = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + dang_eval_y_2 = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + dang_eval_z_2 = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + dang_eval_x_3 = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; + dang_eval_y_3 = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; + dang_eval_z_3 = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; + basis_x_eval[ipt + 4*npts] = dang_eval_x_0; + basis_y_eval[ipt + 4*npts] = dang_eval_y_0; + basis_z_eval[ipt + 4*npts] = dang_eval_z_0; + basis_x_eval[ipt + 5*npts] = dang_eval_x_1; + basis_y_eval[ipt + 5*npts] = dang_eval_y_1; + basis_z_eval[ipt + 5*npts] = dang_eval_z_1; + basis_x_eval[ipt + 6*npts] = dang_eval_x_2; + basis_y_eval[ipt + 6*npts] = dang_eval_y_2; + basis_z_eval[ipt + 6*npts] = dang_eval_z_2; + basis_x_eval[ipt + 7*npts] = dang_eval_x_3; + basis_y_eval[ipt + 7*npts] = dang_eval_y_3; + basis_z_eval[ipt + 7*npts] = dang_eval_z_3; + + dang_eval_x_0 = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + dang_eval_y_0 = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + dang_eval_z_0 = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_x_eval[ipt + 8*npts] 
= dang_eval_x_0; + basis_y_eval[ipt + 8*npts] = dang_eval_y_0; + basis_z_eval[ipt + 8*npts] = dang_eval_z_0; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_device.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_device.cu index bb945bab..ab8e5c70 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_device.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_device.cu @@ -17,6 +17,8 @@ #include "device_specific/cuda_device_constants.hpp" +#define GAUXC_CUDA_MAX_L 4 + namespace GauXC { @@ -254,13 +256,17 @@ uint32_t max_threads_shell_to_task_collocation( int32_t l, bool pure ) { case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_1 ); case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_2 ); case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_3 ); + case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_4 ); + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } else { switch(l) { case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_0 ); case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_1 ); case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_2 ); case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_3 ); + case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_4 ); + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } return 0; @@ -289,6 +295,10
@@ void dispatch_shell_to_task_collocation( cudaStream_t stream, int32_t l, case 3: collocation_device_shell_to_task_kernel_spherical_3<<>>( nshells, std::forward(args)... ); break; + case 4: + collocation_device_shell_to_task_kernel_spherical_4<<>>( nshells, std::forward(args)... ); + break; + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } else { switch(l) { @@ -304,6 +313,10 @@ void dispatch_shell_to_task_collocation( cudaStream_t stream, int32_t l, case 3: collocation_device_shell_to_task_kernel_cartesian_3<<>>( nshells, std::forward(args)... ); break; + case 4: + collocation_device_shell_to_task_kernel_cartesian_4<<>>( nshells, std::forward(args)... ); + break; + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } } @@ -338,6 +351,8 @@ uint32_t max_threads_shell_to_task_collocation_gradient( int32_t l, bool pure ) case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_gradient_1 ); case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_gradient_2 ); case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_gradient_3 ); + case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_gradient_4 ); + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } else { switch(l) { @@ -345,6 +360,8 @@ uint32_t max_threads_shell_to_task_collocation_gradient( int32_t l, bool pure ) case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_gradient_1 ); case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_gradient_2 ); case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_gradient_3 ); + case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_gradient_4 ); + default: 
GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } return 0; @@ -373,6 +390,10 @@ void dispatch_shell_to_task_collocation_gradient( cudaStream_t stream, int32_t l case 3: collocation_device_shell_to_task_kernel_spherical_gradient_3<<>>( nshells, std::forward(args)... ); break; + case 4: + collocation_device_shell_to_task_kernel_spherical_gradient_4<<>>( nshells, std::forward(args)... ); + break; + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } else { switch(l) { @@ -388,6 +409,10 @@ void dispatch_shell_to_task_collocation_gradient( cudaStream_t stream, int32_t l case 3: collocation_device_shell_to_task_kernel_cartesian_gradient_3<<>>( nshells, std::forward(args)... ); break; + case 4: + collocation_device_shell_to_task_kernel_cartesian_gradient_4<<>>( nshells, std::forward(args)... ); + break; + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } @@ -423,6 +448,8 @@ uint32_t max_threads_shell_to_task_collocation_hessian( int32_t l, bool pure ) { case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_hessian_1 ); case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_hessian_2 ); case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_hessian_3 ); + case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_hessian_4 ); + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } else { switch(l) { @@ -430,6 +457,8 @@ uint32_t max_threads_shell_to_task_collocation_hessian( int32_t l, bool pure ) { case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_hessian_1 ); case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_hessian_2 ); case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_hessian_3 ); + case 4: return 
util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_hessian_4 ); + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } return 0; @@ -458,6 +487,10 @@ void dispatch_shell_to_task_collocation_hessian( cudaStream_t stream, int32_t l, case 3: collocation_device_shell_to_task_kernel_spherical_hessian_3<<>>( nshells, std::forward(args)... ); break; + case 4: + collocation_device_shell_to_task_kernel_spherical_hessian_4<<>>( nshells, std::forward(args)... ); + break; + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } else { switch(l) { @@ -473,6 +506,10 @@ void dispatch_shell_to_task_collocation_hessian( cudaStream_t stream, int32_t l, case 3: collocation_device_shell_to_task_kernel_cartesian_hessian_3<<>>( nshells, std::forward(args)... ); break; + case 4: + collocation_device_shell_to_task_kernel_cartesian_hessian_4<<>>( nshells, std::forward(args)... ); + break; + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_shell_to_task_kernels.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_shell_to_task_kernels.hpp index 5d2d3e5a..b2848053 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_shell_to_task_kernels.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_shell_to_task_kernels.hpp @@ -11,28 +11,34 @@ #include "collocation/collocation_shell_to_task_kernels_cartesian_l1.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l2.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l3.hpp" +#include "collocation/collocation_shell_to_task_kernels_cartesian_l4.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l0_gradient.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l1_gradient.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l2_gradient.hpp" #include 
"collocation/collocation_shell_to_task_kernels_cartesian_l3_gradient.hpp" +#include "collocation/collocation_shell_to_task_kernels_cartesian_l4_gradient.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l0_hessian.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l1_hessian.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l2_hessian.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l3_hessian.hpp" +#include "collocation/collocation_shell_to_task_kernels_cartesian_l4_hessian.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l0.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l1.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l2.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l3.hpp" +#include "collocation/collocation_shell_to_task_kernels_spherical_l4.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l0_gradient.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l1_gradient.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l2_gradient.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l3_gradient.hpp" +#include "collocation/collocation_shell_to_task_kernels_spherical_l4_gradient.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l0_hessian.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l1_hessian.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l2_hessian.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp" +#include "collocation/collocation_shell_to_task_kernels_spherical_l4_hessian.hpp"